Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +27 -0
- checkpoint_tokens_1000448.pt +3 -0
- checkpoint_tokens_2000896.pt +3 -0
- checkpoint_tokens_3001344.pt +3 -0
- checkpoint_tokens_4001792.pt +3 -0
- checkpoint_tokens_5002240.pt +3 -0
- checkpoint_tokens_6002688.pt +3 -0
- checkpoint_tokens_7003136.pt +3 -0
- checkpoint_tokens_8003584.pt +3 -0
- checkpoint_tokens_9004032.pt +3 -0
- evaluation/metrics.json +404 -0
- evaluation/metrics_tokens_1000448.json +433 -0
- evaluation/metrics_tokens_2000896.json +433 -0
- evaluation/metrics_tokens_3001344.json +433 -0
- evaluation/metrics_tokens_4001792.json +433 -0
- evaluation/metrics_tokens_5002240.json +433 -0
- evaluation/metrics_tokens_6002688.json +433 -0
- evaluation/metrics_tokens_7003136.json +433 -0
- evaluation/metrics_tokens_8003584.json +433 -0
- evaluation/metrics_tokens_9004032.json +433 -0
- evaluation/plots/kl_divergences_step_1954_tokens_2000896.png +3 -0
- evaluation/plots/kl_divergences_step_2931_tokens_3001344.png +3 -0
- evaluation/plots/kl_divergences_step_3908_tokens_4001792.png +3 -0
- evaluation/plots/kl_divergences_step_4885_tokens_5002240.png +3 -0
- evaluation/plots/kl_divergences_step_5862_tokens_6002688.png +3 -0
- evaluation/plots/kl_divergences_step_6839_tokens_7003136.png +3 -0
- evaluation/plots/kl_divergences_step_7816_tokens_8003584.png +3 -0
- evaluation/plots/kl_divergences_step_8793_tokens_9004032.png +3 -0
- evaluation/plots/kl_divergences_step_977_tokens_1000448.png +3 -0
- evaluation/plots/mae_hidden_states_step_1954_tokens_2000896.png +3 -0
- evaluation/plots/mae_hidden_states_step_2931_tokens_3001344.png +3 -0
- evaluation/plots/mae_hidden_states_step_3908_tokens_4001792.png +3 -0
- evaluation/plots/mae_hidden_states_step_4885_tokens_5002240.png +3 -0
- evaluation/plots/mae_hidden_states_step_5862_tokens_6002688.png +3 -0
- evaluation/plots/mae_hidden_states_step_6839_tokens_7003136.png +3 -0
- evaluation/plots/mae_hidden_states_step_7816_tokens_8003584.png +3 -0
- evaluation/plots/mae_hidden_states_step_8793_tokens_9004032.png +3 -0
- evaluation/plots/mae_hidden_states_step_977_tokens_1000448.png +3 -0
- evaluation/plots/multi_dataset_alignment_step_1954_tokens_2000896.png +3 -0
- evaluation/plots/multi_dataset_alignment_step_2931_tokens_3001344.png +3 -0
- evaluation/plots/multi_dataset_alignment_step_3908_tokens_4001792.png +3 -0
- evaluation/plots/multi_dataset_alignment_step_4885_tokens_5002240.png +3 -0
- evaluation/plots/multi_dataset_alignment_step_5862_tokens_6002688.png +3 -0
- evaluation/plots/multi_dataset_alignment_step_6839_tokens_7003136.png +3 -0
- evaluation/plots/multi_dataset_alignment_step_7816_tokens_8003584.png +3 -0
- evaluation/plots/multi_dataset_alignment_step_8793_tokens_9004032.png +3 -0
- evaluation/plots/multi_dataset_alignment_step_977_tokens_1000448.png +3 -0
- metrics_tokens_1000448.json +24 -0
- metrics_tokens_2000896.json +24 -0
- metrics_tokens_3001344.json +24 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,30 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
evaluation/plots/kl_divergences_step_1954_tokens_2000896.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
evaluation/plots/kl_divergences_step_2931_tokens_3001344.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
evaluation/plots/kl_divergences_step_3908_tokens_4001792.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
evaluation/plots/kl_divergences_step_4885_tokens_5002240.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
evaluation/plots/kl_divergences_step_5862_tokens_6002688.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
evaluation/plots/kl_divergences_step_6839_tokens_7003136.png filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
evaluation/plots/kl_divergences_step_7816_tokens_8003584.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
evaluation/plots/kl_divergences_step_8793_tokens_9004032.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
evaluation/plots/kl_divergences_step_977_tokens_1000448.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
evaluation/plots/mae_hidden_states_step_1954_tokens_2000896.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
evaluation/plots/mae_hidden_states_step_2931_tokens_3001344.png filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
evaluation/plots/mae_hidden_states_step_3908_tokens_4001792.png filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
evaluation/plots/mae_hidden_states_step_4885_tokens_5002240.png filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
evaluation/plots/mae_hidden_states_step_5862_tokens_6002688.png filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
evaluation/plots/mae_hidden_states_step_6839_tokens_7003136.png filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
evaluation/plots/mae_hidden_states_step_7816_tokens_8003584.png filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
evaluation/plots/mae_hidden_states_step_8793_tokens_9004032.png filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
evaluation/plots/mae_hidden_states_step_977_tokens_1000448.png filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
evaluation/plots/multi_dataset_alignment_step_1954_tokens_2000896.png filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
evaluation/plots/multi_dataset_alignment_step_2931_tokens_3001344.png filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
evaluation/plots/multi_dataset_alignment_step_3908_tokens_4001792.png filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
evaluation/plots/multi_dataset_alignment_step_4885_tokens_5002240.png filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
evaluation/plots/multi_dataset_alignment_step_5862_tokens_6002688.png filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
evaluation/plots/multi_dataset_alignment_step_6839_tokens_7003136.png filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
evaluation/plots/multi_dataset_alignment_step_7816_tokens_8003584.png filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
evaluation/plots/multi_dataset_alignment_step_8793_tokens_9004032.png filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
evaluation/plots/multi_dataset_alignment_step_977_tokens_1000448.png filter=lfs diff=lfs merge=lfs -text
|
checkpoint_tokens_1000448.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:453c2d47e1d2491cfae40e414c0fd4a8cc8084a134bdf62c821738e55bd7779b
|
| 3 |
+
size 5559210371
|
checkpoint_tokens_2000896.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:01a142575fb2cd4748c8acf842ac4e80b2a9dda0795dc62e567d0cd8dcef92cd
|
| 3 |
+
size 5559210371
|
checkpoint_tokens_3001344.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:62d4fc045c1ea757d769ad61629ba6c4da447c8f9904795d5baae9e055c5ad1f
|
| 3 |
+
size 5559210371
|
checkpoint_tokens_4001792.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39f094260f810c1a31195dc3ceb2728f4c4fc1523cddddf6e103a0816f210187
|
| 3 |
+
size 5559210371
|
checkpoint_tokens_5002240.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cddc6b26bb6c2790c7a7a53416c5bd57a960e99d4c45f5f7b251733788e75f9
|
| 3 |
+
size 5559210371
|
checkpoint_tokens_6002688.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0581e9e4ff1d9e165bfb5e961d61ea249458419912b27cfa440b74e21e23d08a
|
| 3 |
+
size 5559210371
|
checkpoint_tokens_7003136.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fde3154cede539bbb9c2c0cbb5c2771d247f455ee91547d8f200d9ad2e862cc5
|
| 3 |
+
size 5559210371
|
checkpoint_tokens_8003584.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a1f49201248a6845238afe71798950191df59c1d6c7e9730f96de728fa2e46f8
|
| 3 |
+
size 5559210371
|
checkpoint_tokens_9004032.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3059b5873c7f8ddb7438bb51b05b2abf11a007981a222e870b545c93f7e28a10
|
| 3 |
+
size 5559210371
|
evaluation/metrics.json
ADDED
|
@@ -0,0 +1,404 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"global_step": 8793,
|
| 3 |
+
"n_tokens": 9004032,
|
| 4 |
+
"kl_divergence": {
|
| 5 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 6.006870746612549,
|
| 6 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 6.394875526428223,
|
| 7 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 5.912027359008789,
|
| 8 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.115749359130859,
|
| 9 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.151121616363525,
|
| 10 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.175347805023193,
|
| 11 |
+
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
|
| 12 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.124427318572998,
|
| 13 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.121898651123047,
|
| 14 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.1416893005371094,
|
| 15 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.1991913318634033,
|
| 16 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.14923357963562,
|
| 17 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.216580390930176,
|
| 18 |
+
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
|
| 19 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.9164533615112305,
|
| 20 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.773134708404541,
|
| 21 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.090576171875,
|
| 22 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 5.982679843902588,
|
| 23 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.148589134216309,
|
| 24 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.272560119628906,
|
| 25 |
+
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
|
| 26 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.3844406604766846,
|
| 27 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.425341844558716,
|
| 28 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.334113597869873,
|
| 29 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.3682360649108887,
|
| 30 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.3439788818359375,
|
| 31 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.5122714042663574,
|
| 32 |
+
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
|
| 33 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.4646520614624023,
|
| 34 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.1960129737854004,
|
| 35 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.9887456893920898,
|
| 36 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.074134111404419,
|
| 37 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.142500638961792,
|
| 38 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.9460818767547607,
|
| 39 |
+
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
|
| 40 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 3.4264042377471924,
|
| 41 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.065612554550171,
|
| 42 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.84149169921875,
|
| 43 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.0016493797302246,
|
| 44 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 2.982909679412842,
|
| 45 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.7882883548736572,
|
| 46 |
+
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
|
| 47 |
+
},
|
| 48 |
+
"mae_hidden_states": {
|
| 49 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.144361138343811,
|
| 50 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.1407876014709473,
|
| 51 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.1702628135681152,
|
| 52 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.169557809829712,
|
| 53 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.164795160293579,
|
| 54 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.165663480758667,
|
| 55 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 0.9478356242179871,
|
| 56 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9305350184440613,
|
| 57 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 0.9448918104171753,
|
| 58 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 0.9919092059135437,
|
| 59 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.9386879801750183,
|
| 60 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9315637350082397,
|
| 61 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 0.9601666331291199,
|
| 62 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.8851673007011414,
|
| 63 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.8906123042106628,
|
| 64 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.8979656100273132,
|
| 65 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.8988674283027649,
|
| 66 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.900534451007843,
|
| 67 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.154961109161377,
|
| 68 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.1417714357376099,
|
| 69 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.147143840789795,
|
| 70 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.1556771993637085,
|
| 71 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.14786696434021,
|
| 72 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.148809790611267,
|
| 73 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 0.9560009837150574,
|
| 74 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 0.9207914471626282,
|
| 75 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 0.9233508110046387,
|
| 76 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 0.93439781665802,
|
| 77 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.894271194934845,
|
| 78 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.9094542264938354,
|
| 79 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.068742275238037,
|
| 80 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.0256458520889282,
|
| 81 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.0398327112197876,
|
| 82 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.056915283203125,
|
| 83 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.0318653583526611,
|
| 84 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.016662359237671
|
| 85 |
+
},
|
| 86 |
+
"alignment": {
|
| 87 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 88 |
+
"mse": 0.328125,
|
| 89 |
+
"mean_cosine_similarity": 0.921875,
|
| 90 |
+
"std_cosine_similarity": 0.1474609375,
|
| 91 |
+
"mean_l2_distance": 16.875,
|
| 92 |
+
"std_l2_distance": 11.6875,
|
| 93 |
+
"mean_dimension_correlation": 0.911785888671875,
|
| 94 |
+
"std_dimension_correlation": 0.029143728520497736,
|
| 95 |
+
"linear_cka": 0.97265625
|
| 96 |
+
},
|
| 97 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 98 |
+
"mse": 0.333984375,
|
| 99 |
+
"mean_cosine_similarity": 0.91796875,
|
| 100 |
+
"std_cosine_similarity": 0.1533203125,
|
| 101 |
+
"mean_l2_distance": 17.0,
|
| 102 |
+
"std_l2_distance": 12.0,
|
| 103 |
+
"mean_dimension_correlation": 0.908795166015625,
|
| 104 |
+
"std_dimension_correlation": 0.030533706065498583,
|
| 105 |
+
"linear_cka": 0.97265625
|
| 106 |
+
},
|
| 107 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 108 |
+
"mse": 0.333984375,
|
| 109 |
+
"mean_cosine_similarity": 0.921875,
|
| 110 |
+
"std_cosine_similarity": 0.1455078125,
|
| 111 |
+
"mean_l2_distance": 17.0,
|
| 112 |
+
"std_l2_distance": 11.4375,
|
| 113 |
+
"mean_dimension_correlation": 0.9114944458007812,
|
| 114 |
+
"std_dimension_correlation": 0.02791911734622361,
|
| 115 |
+
"linear_cka": 0.97265625
|
| 116 |
+
},
|
| 117 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
|
| 118 |
+
"mse": 0.333984375,
|
| 119 |
+
"mean_cosine_similarity": 0.91796875,
|
| 120 |
+
"std_cosine_similarity": 0.150390625,
|
| 121 |
+
"mean_l2_distance": 17.0,
|
| 122 |
+
"std_l2_distance": 11.8125,
|
| 123 |
+
"mean_dimension_correlation": 0.910113525390625,
|
| 124 |
+
"std_dimension_correlation": 0.029502623650495614,
|
| 125 |
+
"linear_cka": 0.97265625
|
| 126 |
+
},
|
| 127 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
|
| 128 |
+
"mse": 0.330078125,
|
| 129 |
+
"mean_cosine_similarity": 0.921875,
|
| 130 |
+
"std_cosine_similarity": 0.1513671875,
|
| 131 |
+
"mean_l2_distance": 17.0,
|
| 132 |
+
"std_l2_distance": 11.75,
|
| 133 |
+
"mean_dimension_correlation": 0.910638427734375,
|
| 134 |
+
"std_dimension_correlation": 0.029250348976490634,
|
| 135 |
+
"linear_cka": 0.97265625
|
| 136 |
+
},
|
| 137 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 138 |
+
"mse": 0.328125,
|
| 139 |
+
"mean_cosine_similarity": 0.921875,
|
| 140 |
+
"std_cosine_similarity": 0.1474609375,
|
| 141 |
+
"mean_l2_distance": 16.875,
|
| 142 |
+
"std_l2_distance": 11.6875,
|
| 143 |
+
"mean_dimension_correlation": 0.9118026733398438,
|
| 144 |
+
"std_dimension_correlation": 0.02911178290188815,
|
| 145 |
+
"linear_cka": 0.97265625
|
| 146 |
+
},
|
| 147 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 148 |
+
"mse": 0.259765625,
|
| 149 |
+
"mean_cosine_similarity": 0.9375,
|
| 150 |
+
"std_cosine_similarity": 0.1484375,
|
| 151 |
+
"mean_l2_distance": 13.3125,
|
| 152 |
+
"std_l2_distance": 12.4375,
|
| 153 |
+
"mean_dimension_correlation": 0.9281707763671875,
|
| 154 |
+
"std_dimension_correlation": 0.027613594267907524,
|
| 155 |
+
"linear_cka": 0.984375
|
| 156 |
+
},
|
| 157 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 158 |
+
"mse": 0.25390625,
|
| 159 |
+
"mean_cosine_similarity": 0.94140625,
|
| 160 |
+
"std_cosine_similarity": 0.1474609375,
|
| 161 |
+
"mean_l2_distance": 13.0625,
|
| 162 |
+
"std_l2_distance": 12.375,
|
| 163 |
+
"mean_dimension_correlation": 0.929296875,
|
| 164 |
+
"std_dimension_correlation": 0.027428457660098507,
|
| 165 |
+
"linear_cka": 0.984375
|
| 166 |
+
},
|
| 167 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
|
| 168 |
+
"mse": 0.26171875,
|
| 169 |
+
"mean_cosine_similarity": 0.9375,
|
| 170 |
+
"std_cosine_similarity": 0.1572265625,
|
| 171 |
+
"mean_l2_distance": 13.375,
|
| 172 |
+
"std_l2_distance": 12.8125,
|
| 173 |
+
"mean_dimension_correlation": 0.9258895874023437,
|
| 174 |
+
"std_dimension_correlation": 0.02807925327640673,
|
| 175 |
+
"linear_cka": 0.984375
|
| 176 |
+
},
|
| 177 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
|
| 178 |
+
"mse": 0.2578125,
|
| 179 |
+
"mean_cosine_similarity": 0.9375,
|
| 180 |
+
"std_cosine_similarity": 0.15625,
|
| 181 |
+
"mean_l2_distance": 13.1875,
|
| 182 |
+
"std_l2_distance": 12.875,
|
| 183 |
+
"mean_dimension_correlation": 0.9261764526367188,
|
| 184 |
+
"std_dimension_correlation": 0.029308957111961503,
|
| 185 |
+
"linear_cka": 0.984375
|
| 186 |
+
},
|
| 187 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 188 |
+
"mse": 0.333984375,
|
| 189 |
+
"mean_cosine_similarity": 0.91796875,
|
| 190 |
+
"std_cosine_similarity": 0.1533203125,
|
| 191 |
+
"mean_l2_distance": 17.0,
|
| 192 |
+
"std_l2_distance": 12.0,
|
| 193 |
+
"mean_dimension_correlation": 0.9088623046875,
|
| 194 |
+
"std_dimension_correlation": 0.030521200956836466,
|
| 195 |
+
"linear_cka": 0.97265625
|
| 196 |
+
},
|
| 197 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 198 |
+
"mse": 0.259765625,
|
| 199 |
+
"mean_cosine_similarity": 0.9375,
|
| 200 |
+
"std_cosine_similarity": 0.1484375,
|
| 201 |
+
"mean_l2_distance": 13.3125,
|
| 202 |
+
"std_l2_distance": 12.4375,
|
| 203 |
+
"mean_dimension_correlation": 0.9282363891601563,
|
| 204 |
+
"std_dimension_correlation": 0.02761614875613791,
|
| 205 |
+
"linear_cka": 0.984375
|
| 206 |
+
},
|
| 207 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 208 |
+
"mse": 0.248046875,
|
| 209 |
+
"mean_cosine_similarity": 0.9375,
|
| 210 |
+
"std_cosine_similarity": 0.15625,
|
| 211 |
+
"mean_l2_distance": 12.6875,
|
| 212 |
+
"std_l2_distance": 13.0,
|
| 213 |
+
"mean_dimension_correlation": 0.9286865234375,
|
| 214 |
+
"std_dimension_correlation": 0.028394499325967187,
|
| 215 |
+
"linear_cka": 1.0
|
| 216 |
+
},
|
| 217 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
|
| 218 |
+
"mse": 0.251953125,
|
| 219 |
+
"mean_cosine_similarity": 0.9375,
|
| 220 |
+
"std_cosine_similarity": 0.1572265625,
|
| 221 |
+
"mean_l2_distance": 12.875,
|
| 222 |
+
"std_l2_distance": 13.0,
|
| 223 |
+
"mean_dimension_correlation": 0.9273910522460938,
|
| 224 |
+
"std_dimension_correlation": 0.029792982191153054,
|
| 225 |
+
"linear_cka": 1.0
|
| 226 |
+
},
|
| 227 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
|
| 228 |
+
"mse": 0.2451171875,
|
| 229 |
+
"mean_cosine_similarity": 0.94140625,
|
| 230 |
+
"std_cosine_similarity": 0.146484375,
|
| 231 |
+
"mean_l2_distance": 12.5625,
|
| 232 |
+
"std_l2_distance": 12.4375,
|
| 233 |
+
"mean_dimension_correlation": 0.9318832397460938,
|
| 234 |
+
"std_dimension_correlation": 0.02779797256144542,
|
| 235 |
+
"linear_cka": 0.984375
|
| 236 |
+
},
|
| 237 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 238 |
+
"mse": 0.333984375,
|
| 239 |
+
"mean_cosine_similarity": 0.921875,
|
| 240 |
+
"std_cosine_similarity": 0.1455078125,
|
| 241 |
+
"mean_l2_distance": 17.0,
|
| 242 |
+
"std_l2_distance": 11.4375,
|
| 243 |
+
"mean_dimension_correlation": 0.9115066528320312,
|
| 244 |
+
"std_dimension_correlation": 0.02783942438110558,
|
| 245 |
+
"linear_cka": 0.97265625
|
| 246 |
+
},
|
| 247 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 248 |
+
"mse": 0.25390625,
|
| 249 |
+
"mean_cosine_similarity": 0.94140625,
|
| 250 |
+
"std_cosine_similarity": 0.1474609375,
|
| 251 |
+
"mean_l2_distance": 13.0625,
|
| 252 |
+
"std_l2_distance": 12.375,
|
| 253 |
+
"mean_dimension_correlation": 0.929364013671875,
|
| 254 |
+
"std_dimension_correlation": 0.027418246966595963,
|
| 255 |
+
"linear_cka": 0.984375
|
| 256 |
+
},
|
| 257 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 258 |
+
"mse": 0.248046875,
|
| 259 |
+
"mean_cosine_similarity": 0.9375,
|
| 260 |
+
"std_cosine_similarity": 0.15625,
|
| 261 |
+
"mean_l2_distance": 12.6875,
|
| 262 |
+
"std_l2_distance": 13.0,
|
| 263 |
+
"mean_dimension_correlation": 0.9286041259765625,
|
| 264 |
+
"std_dimension_correlation": 0.028414978282929146,
|
| 265 |
+
"linear_cka": 1.0
|
| 266 |
+
},
|
| 267 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
|
| 268 |
+
"mse": 0.25390625,
|
| 269 |
+
"mean_cosine_similarity": 0.9375,
|
| 270 |
+
"std_cosine_similarity": 0.154296875,
|
| 271 |
+
"mean_l2_distance": 13.0,
|
| 272 |
+
"std_l2_distance": 12.8125,
|
| 273 |
+
"mean_dimension_correlation": 0.927911376953125,
|
| 274 |
+
"std_dimension_correlation": 0.027630500633115,
|
| 275 |
+
"linear_cka": 0.984375
|
| 276 |
+
},
|
| 277 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
|
| 278 |
+
"mse": 0.24609375,
|
| 279 |
+
"mean_cosine_similarity": 0.94140625,
|
| 280 |
+
"std_cosine_similarity": 0.1513671875,
|
| 281 |
+
"mean_l2_distance": 12.6875,
|
| 282 |
+
"std_l2_distance": 12.625,
|
| 283 |
+
"mean_dimension_correlation": 0.9304595947265625,
|
| 284 |
+
"std_dimension_correlation": 0.027486156310205404,
|
| 285 |
+
"linear_cka": 0.984375
|
| 286 |
+
},
|
| 287 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 288 |
+
"mse": 0.333984375,
|
| 289 |
+
"mean_cosine_similarity": 0.91796875,
|
| 290 |
+
"std_cosine_similarity": 0.150390625,
|
| 291 |
+
"mean_l2_distance": 17.0,
|
| 292 |
+
"std_l2_distance": 11.8125,
|
| 293 |
+
"mean_dimension_correlation": 0.910162353515625,
|
| 294 |
+
"std_dimension_correlation": 0.029493359043523553,
|
| 295 |
+
"linear_cka": 0.97265625
|
| 296 |
+
},
|
| 297 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 298 |
+
"mse": 0.26171875,
|
| 299 |
+
"mean_cosine_similarity": 0.9375,
|
| 300 |
+
"std_cosine_similarity": 0.1572265625,
|
| 301 |
+
"mean_l2_distance": 13.375,
|
| 302 |
+
"std_l2_distance": 12.8125,
|
| 303 |
+
"mean_dimension_correlation": 0.9259017944335938,
|
| 304 |
+
"std_dimension_correlation": 0.02807177297712858,
|
| 305 |
+
"linear_cka": 0.984375
|
| 306 |
+
},
|
| 307 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 308 |
+
"mse": 0.251953125,
|
| 309 |
+
"mean_cosine_similarity": 0.9375,
|
| 310 |
+
"std_cosine_similarity": 0.1572265625,
|
| 311 |
+
"mean_l2_distance": 12.875,
|
| 312 |
+
"std_l2_distance": 13.0,
|
| 313 |
+
"mean_dimension_correlation": 0.9274307250976562,
|
| 314 |
+
"std_dimension_correlation": 0.029833198285711512,
|
| 315 |
+
"linear_cka": 1.0
|
| 316 |
+
},
|
| 317 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 318 |
+
"mse": 0.25390625,
|
| 319 |
+
"mean_cosine_similarity": 0.9375,
|
| 320 |
+
"std_cosine_similarity": 0.154296875,
|
| 321 |
+
"mean_l2_distance": 13.0,
|
| 322 |
+
"std_l2_distance": 12.8125,
|
| 323 |
+
"mean_dimension_correlation": 0.9279266357421875,
|
| 324 |
+
"std_dimension_correlation": 0.027642045164903522,
|
| 325 |
+
"linear_cka": 0.984375
|
| 326 |
+
},
|
| 327 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
|
| 328 |
+
"mse": 0.236328125,
|
| 329 |
+
"mean_cosine_similarity": 0.94140625,
|
| 330 |
+
"std_cosine_similarity": 0.158203125,
|
| 331 |
+
"mean_l2_distance": 12.125,
|
| 332 |
+
"std_l2_distance": 13.25,
|
| 333 |
+
"mean_dimension_correlation": 0.93009033203125,
|
| 334 |
+
"std_dimension_correlation": 0.029206890514525005,
|
| 335 |
+
"linear_cka": 1.0
|
| 336 |
+
},
|
| 337 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 338 |
+
"mse": 0.330078125,
|
| 339 |
+
"mean_cosine_similarity": 0.921875,
|
| 340 |
+
"std_cosine_similarity": 0.1513671875,
|
| 341 |
+
"mean_l2_distance": 17.0,
|
| 342 |
+
"std_l2_distance": 11.75,
|
| 343 |
+
"mean_dimension_correlation": 0.910687255859375,
|
| 344 |
+
"std_dimension_correlation": 0.02925704219094372,
|
| 345 |
+
"linear_cka": 0.97265625
|
| 346 |
+
},
|
| 347 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 348 |
+
"mse": 0.2578125,
|
| 349 |
+
"mean_cosine_similarity": 0.9375,
|
| 350 |
+
"std_cosine_similarity": 0.15625,
|
| 351 |
+
"mean_l2_distance": 13.1875,
|
| 352 |
+
"std_l2_distance": 12.875,
|
| 353 |
+
"mean_dimension_correlation": 0.9263031005859375,
|
| 354 |
+
"std_dimension_correlation": 0.0292820509917565,
|
| 355 |
+
"linear_cka": 0.984375
|
| 356 |
+
},
|
| 357 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 358 |
+
"mse": 0.2451171875,
|
| 359 |
+
"mean_cosine_similarity": 0.94140625,
|
| 360 |
+
"std_cosine_similarity": 0.146484375,
|
| 361 |
+
"mean_l2_distance": 12.5625,
|
| 362 |
+
"std_l2_distance": 12.4375,
|
| 363 |
+
"mean_dimension_correlation": 0.9319442749023438,
|
| 364 |
+
"std_dimension_correlation": 0.02774844077379742,
|
| 365 |
+
"linear_cka": 0.984375
|
| 366 |
+
},
|
| 367 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 368 |
+
"mse": 0.24609375,
|
| 369 |
+
"mean_cosine_similarity": 0.94140625,
|
| 370 |
+
"std_cosine_similarity": 0.1513671875,
|
| 371 |
+
"mean_l2_distance": 12.6875,
|
| 372 |
+
"std_l2_distance": 12.625,
|
| 373 |
+
"mean_dimension_correlation": 0.9305908203125,
|
| 374 |
+
"std_dimension_correlation": 0.027491914687628318,
|
| 375 |
+
"linear_cka": 0.984375
|
| 376 |
+
},
|
| 377 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
|
| 378 |
+
"mse": 0.236328125,
|
| 379 |
+
"mean_cosine_similarity": 0.94140625,
|
| 380 |
+
"std_cosine_similarity": 0.158203125,
|
| 381 |
+
"mean_l2_distance": 12.125,
|
| 382 |
+
"std_l2_distance": 13.25,
|
| 383 |
+
"mean_dimension_correlation": 0.9300765991210938,
|
| 384 |
+
"std_dimension_correlation": 0.029167152542476576,
|
| 385 |
+
"linear_cka": 1.0
|
| 386 |
+
},
|
| 387 |
+
"avg_mse": 0.2783203125,
|
| 388 |
+
"std_mse": 0.03847375333442295,
|
| 389 |
+
"avg_mean_cosine_similarity": 0.9328125,
|
| 390 |
+
"std_mean_cosine_similarity": 0.009043622580304863,
|
| 391 |
+
"avg_std_cosine_similarity": 0.15208333333333332,
|
| 392 |
+
"std_std_cosine_similarity": 0.004211187924165684,
|
| 393 |
+
"avg_mean_l2_distance": 14.25,
|
| 394 |
+
"std_mean_l2_distance": 1.9497596005661826,
|
| 395 |
+
"avg_std_l2_distance": 12.420833333333333,
|
| 396 |
+
"std_std_l2_distance": 0.5426913385054979,
|
| 397 |
+
"avg_mean_dimension_correlation": 0.9226060994466144,
|
| 398 |
+
"std_mean_dimension_correlation": 0.00864781898546122,
|
| 399 |
+
"avg_std_dimension_correlation": 0.02859991824384515,
|
| 400 |
+
"std_std_dimension_correlation": 0.00095096984819575,
|
| 401 |
+
"avg_linear_cka": 0.98359375,
|
| 402 |
+
"std_linear_cka": 0.009695057535930357
|
| 403 |
+
}
|
| 404 |
+
}
|
evaluation/metrics_tokens_1000448.json
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 1,
|
| 3 |
+
"n_tokens": 1000448,
|
| 4 |
+
"global_step": 977,
|
| 5 |
+
"training_metrics": {
|
| 6 |
+
"train/loss": 0.3984375,
|
| 7 |
+
"train/contrastive": 0.310546875,
|
| 8 |
+
"train/recons_loss": 0.68359375,
|
| 9 |
+
"train/balance_loss": 2.015625,
|
| 10 |
+
"train/balance_loss_contrastive": 1.0,
|
| 11 |
+
"train/balance_loss_recons": 1.015625,
|
| 12 |
+
"train/contrastive_std": 0.015625,
|
| 13 |
+
"train/recons_std": 0.0888671875,
|
| 14 |
+
"train/contrastive_min": 0.27734375,
|
| 15 |
+
"train/contrastive_max": 0.3359375,
|
| 16 |
+
"train/recons_min": 0.58203125,
|
| 17 |
+
"train/recons_max": 0.84375,
|
| 18 |
+
"train/Qwen3_0.6B_layer_2": 0.58203125,
|
| 19 |
+
"train/Qwen3_0.6B_layer_4": 0.69140625,
|
| 20 |
+
"train/Qwen3_1.7B_layer_2": 0.65234375,
|
| 21 |
+
"train/Qwen3_1.7B_layer_4": 0.84375,
|
| 22 |
+
"train/Qwen3_4B_layer_2": 0.63671875,
|
| 23 |
+
"train/Qwen3_4B_layer_4": 0.6953125,
|
| 24 |
+
"train/contrastives": null,
|
| 25 |
+
"train/epoch": 1,
|
| 26 |
+
"train/n_tokens": 1000448,
|
| 27 |
+
"train/step": 977
|
| 28 |
+
},
|
| 29 |
+
"eval_metrics": {
|
| 30 |
+
"global_step": 977,
|
| 31 |
+
"n_tokens": 1000448,
|
| 32 |
+
"kl_divergence": {
|
| 33 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 6.733099460601807,
|
| 34 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 7.016953945159912,
|
| 35 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 6.4851484298706055,
|
| 36 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.2205095291137695,
|
| 37 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.60541296005249,
|
| 38 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.584229946136475,
|
| 39 |
+
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
|
| 40 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 3.0528693199157715,
|
| 41 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 3.0396716594696045,
|
| 42 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 3.070789337158203,
|
| 43 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 3.154045820236206,
|
| 44 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 3.1686503887176514,
|
| 45 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 3.2403674125671387,
|
| 46 |
+
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
|
| 47 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.6706085205078125,
|
| 48 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.970404624938965,
|
| 49 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.1444926261901855,
|
| 50 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 5.856858253479004,
|
| 51 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.129464149475098,
|
| 52 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 5.904314994812012,
|
| 53 |
+
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
|
| 54 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 3.781726360321045,
|
| 55 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 3.783668041229248,
|
| 56 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 3.837904453277588,
|
| 57 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 3.7967495918273926,
|
| 58 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 3.835503101348877,
|
| 59 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 3.970118761062622,
|
| 60 |
+
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
|
| 61 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 3.5546975135803223,
|
| 62 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 3.486462116241455,
|
| 63 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 3.2824597358703613,
|
| 64 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 3.3095145225524902,
|
| 65 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 3.164924383163452,
|
| 66 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 3.2683002948760986,
|
| 67 |
+
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
|
| 68 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 5.2000813484191895,
|
| 69 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 4.317432403564453,
|
| 70 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 4.843194007873535,
|
| 71 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 4.330343723297119,
|
| 72 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 4.750157356262207,
|
| 73 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 4.742369174957275,
|
| 74 |
+
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
|
| 75 |
+
},
|
| 76 |
+
"mae_hidden_states": {
|
| 77 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.140511155128479,
|
| 78 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.1872466802597046,
|
| 79 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.1824793815612793,
|
| 80 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.1870241165161133,
|
| 81 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.1805082559585571,
|
| 82 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.1967138051986694,
|
| 83 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 1.3885185718536377,
|
| 84 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.3479934930801392,
|
| 85 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.3934825658798218,
|
| 86 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.4097040891647339,
|
| 87 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.3858134746551514,
|
| 88 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.3670704364776611,
|
| 89 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 1.3714081048965454,
|
| 90 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.335449457168579,
|
| 91 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.274694561958313,
|
| 92 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.3633638620376587,
|
| 93 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.3621580600738525,
|
| 94 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.3528505563735962,
|
| 95 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 2.7587032318115234,
|
| 96 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.6864445209503174,
|
| 97 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 2.717672824859619,
|
| 98 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.55366587638855,
|
| 99 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.7207069396972656,
|
| 100 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 2.698686361312866,
|
| 101 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 1.694230079650879,
|
| 102 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.7634074687957764,
|
| 103 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.6623401641845703,
|
| 104 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.6645656824111938,
|
| 105 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.5185480117797852,
|
| 106 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.6429026126861572,
|
| 107 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.6580111980438232,
|
| 108 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.689562439918518,
|
| 109 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.6670496463775635,
|
| 110 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.674801230430603,
|
| 111 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.665151834487915,
|
| 112 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.5619523525238037
|
| 113 |
+
},
|
| 114 |
+
"alignment": {
|
| 115 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 116 |
+
"mse": 1.2421875,
|
| 117 |
+
"mean_cosine_similarity": 0.2119140625,
|
| 118 |
+
"std_cosine_similarity": 0.05712890625,
|
| 119 |
+
"mean_l2_distance": 63.5,
|
| 120 |
+
"std_l2_distance": 2.28125,
|
| 121 |
+
"mean_dimension_correlation": 0.1991757216863334,
|
| 122 |
+
"std_dimension_correlation": 0.1752726942140706,
|
| 123 |
+
"linear_cka": 0.98046875
|
| 124 |
+
},
|
| 125 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 126 |
+
"mse": 1.2734375,
|
| 127 |
+
"mean_cosine_similarity": 0.171875,
|
| 128 |
+
"std_cosine_similarity": 0.061279296875,
|
| 129 |
+
"mean_l2_distance": 65.0,
|
| 130 |
+
"std_l2_distance": 2.40625,
|
| 131 |
+
"mean_dimension_correlation": 0.1659273698925972,
|
| 132 |
+
"std_dimension_correlation": 0.1742108812082249,
|
| 133 |
+
"linear_cka": 0.98046875
|
| 134 |
+
},
|
| 135 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 136 |
+
"mse": 1.2421875,
|
| 137 |
+
"mean_cosine_similarity": 0.212890625,
|
| 138 |
+
"std_cosine_similarity": 0.048583984375,
|
| 139 |
+
"mean_l2_distance": 63.5,
|
| 140 |
+
"std_l2_distance": 1.9296875,
|
| 141 |
+
"mean_dimension_correlation": 0.2012754407711327,
|
| 142 |
+
"std_dimension_correlation": 0.1694794786130474,
|
| 143 |
+
"linear_cka": 0.9765625
|
| 144 |
+
},
|
| 145 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
|
| 146 |
+
"mse": 1.28125,
|
| 147 |
+
"mean_cosine_similarity": 0.1640625,
|
| 148 |
+
"std_cosine_similarity": 0.051513671875,
|
| 149 |
+
"mean_l2_distance": 65.5,
|
| 150 |
+
"std_l2_distance": 2.03125,
|
| 151 |
+
"mean_dimension_correlation": 0.15531851844862105,
|
| 152 |
+
"std_dimension_correlation": 0.17483813595973072,
|
| 153 |
+
"linear_cka": 0.97265625
|
| 154 |
+
},
|
| 155 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
|
| 156 |
+
"mse": 1.25,
|
| 157 |
+
"mean_cosine_similarity": 0.203125,
|
| 158 |
+
"std_cosine_similarity": 0.0556640625,
|
| 159 |
+
"mean_l2_distance": 63.75,
|
| 160 |
+
"std_l2_distance": 2.21875,
|
| 161 |
+
"mean_dimension_correlation": 0.19491876736283303,
|
| 162 |
+
"std_dimension_correlation": 0.17604292602805996,
|
| 163 |
+
"linear_cka": 0.98046875
|
| 164 |
+
},
|
| 165 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 166 |
+
"mse": 1.2421875,
|
| 167 |
+
"mean_cosine_similarity": 0.2119140625,
|
| 168 |
+
"std_cosine_similarity": 0.05712890625,
|
| 169 |
+
"mean_l2_distance": 63.5,
|
| 170 |
+
"std_l2_distance": 2.28125,
|
| 171 |
+
"mean_dimension_correlation": 0.19918374745175244,
|
| 172 |
+
"std_dimension_correlation": 0.1752806618143252,
|
| 173 |
+
"linear_cka": 0.98046875
|
| 174 |
+
},
|
| 175 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 176 |
+
"mse": 1.25,
|
| 177 |
+
"mean_cosine_similarity": 0.201171875,
|
| 178 |
+
"std_cosine_similarity": 0.0576171875,
|
| 179 |
+
"mean_l2_distance": 64.0,
|
| 180 |
+
"std_l2_distance": 2.3125,
|
| 181 |
+
"mean_dimension_correlation": 0.19001486599445344,
|
| 182 |
+
"std_dimension_correlation": 0.17611495516941666,
|
| 183 |
+
"linear_cka": 0.98046875
|
| 184 |
+
},
|
| 185 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 186 |
+
"mse": 1.2421875,
|
| 187 |
+
"mean_cosine_similarity": 0.216796875,
|
| 188 |
+
"std_cosine_similarity": 0.056884765625,
|
| 189 |
+
"mean_l2_distance": 63.25,
|
| 190 |
+
"std_l2_distance": 2.296875,
|
| 191 |
+
"mean_dimension_correlation": 0.19663777351379394,
|
| 192 |
+
"std_dimension_correlation": 0.1757753968790023,
|
| 193 |
+
"linear_cka": 0.9765625
|
| 194 |
+
},
|
| 195 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
|
| 196 |
+
"mse": 1.28125,
|
| 197 |
+
"mean_cosine_similarity": 0.162109375,
|
| 198 |
+
"std_cosine_similarity": 0.05517578125,
|
| 199 |
+
"mean_l2_distance": 65.5,
|
| 200 |
+
"std_l2_distance": 2.15625,
|
| 201 |
+
"mean_dimension_correlation": 0.15222867031116039,
|
| 202 |
+
"std_dimension_correlation": 0.17404282759337691,
|
| 203 |
+
"linear_cka": 0.96875
|
| 204 |
+
},
|
| 205 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
|
| 206 |
+
"mse": 1.28125,
|
| 207 |
+
"mean_cosine_similarity": 0.1728515625,
|
| 208 |
+
"std_cosine_similarity": 0.050048828125,
|
| 209 |
+
"mean_l2_distance": 65.0,
|
| 210 |
+
"std_l2_distance": 1.9609375,
|
| 211 |
+
"mean_dimension_correlation": 0.16632139831781387,
|
| 212 |
+
"std_dimension_correlation": 0.17677271492333846,
|
| 213 |
+
"linear_cka": 0.984375
|
| 214 |
+
},
|
| 215 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 216 |
+
"mse": 1.2734375,
|
| 217 |
+
"mean_cosine_similarity": 0.171875,
|
| 218 |
+
"std_cosine_similarity": 0.061279296875,
|
| 219 |
+
"mean_l2_distance": 65.0,
|
| 220 |
+
"std_l2_distance": 2.40625,
|
| 221 |
+
"mean_dimension_correlation": 0.16591697484254836,
|
| 222 |
+
"std_dimension_correlation": 0.17420606946071834,
|
| 223 |
+
"linear_cka": 0.98046875
|
| 224 |
+
},
|
| 225 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 226 |
+
"mse": 1.25,
|
| 227 |
+
"mean_cosine_similarity": 0.201171875,
|
| 228 |
+
"std_cosine_similarity": 0.0576171875,
|
| 229 |
+
"mean_l2_distance": 64.0,
|
| 230 |
+
"std_l2_distance": 2.3125,
|
| 231 |
+
"mean_dimension_correlation": 0.1900260180234909,
|
| 232 |
+
"std_dimension_correlation": 0.17610506497287914,
|
| 233 |
+
"linear_cka": 0.98046875
|
| 234 |
+
},
|
| 235 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 236 |
+
"mse": 1.2734375,
|
| 237 |
+
"mean_cosine_similarity": 0.1796875,
|
| 238 |
+
"std_cosine_similarity": 0.058349609375,
|
| 239 |
+
"mean_l2_distance": 65.0,
|
| 240 |
+
"std_l2_distance": 2.3125,
|
| 241 |
+
"mean_dimension_correlation": 0.16930750142782927,
|
| 242 |
+
"std_dimension_correlation": 0.17924016132496154,
|
| 243 |
+
"linear_cka": 0.984375
|
| 244 |
+
},
|
| 245 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
|
| 246 |
+
"mse": 1.28125,
|
| 247 |
+
"mean_cosine_similarity": 0.171875,
|
| 248 |
+
"std_cosine_similarity": 0.060791015625,
|
| 249 |
+
"mean_l2_distance": 65.0,
|
| 250 |
+
"std_l2_distance": 2.390625,
|
| 251 |
+
"mean_dimension_correlation": 0.15887711457908155,
|
| 252 |
+
"std_dimension_correlation": 0.1775669214564653,
|
| 253 |
+
"linear_cka": 0.97265625
|
| 254 |
+
},
|
| 255 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
|
| 256 |
+
"mse": 1.2265625,
|
| 257 |
+
"mean_cosine_similarity": 0.2421875,
|
| 258 |
+
"std_cosine_similarity": 0.060302734375,
|
| 259 |
+
"mean_l2_distance": 62.25,
|
| 260 |
+
"std_l2_distance": 2.453125,
|
| 261 |
+
"mean_dimension_correlation": 0.22514247596263887,
|
| 262 |
+
"std_dimension_correlation": 0.17232934079449927,
|
| 263 |
+
"linear_cka": 0.98046875
|
| 264 |
+
},
|
| 265 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 266 |
+
"mse": 1.2421875,
|
| 267 |
+
"mean_cosine_similarity": 0.212890625,
|
| 268 |
+
"std_cosine_similarity": 0.048583984375,
|
| 269 |
+
"mean_l2_distance": 63.5,
|
| 270 |
+
"std_l2_distance": 1.9296875,
|
| 271 |
+
"mean_dimension_correlation": 0.20127090597525238,
|
| 272 |
+
"std_dimension_correlation": 0.16948777576467297,
|
| 273 |
+
"linear_cka": 0.9765625
|
| 274 |
+
},
|
| 275 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 276 |
+
"mse": 1.2421875,
|
| 277 |
+
"mean_cosine_similarity": 0.216796875,
|
| 278 |
+
"std_cosine_similarity": 0.056884765625,
|
| 279 |
+
"mean_l2_distance": 63.25,
|
| 280 |
+
"std_l2_distance": 2.296875,
|
| 281 |
+
"mean_dimension_correlation": 0.1966501235961914,
|
| 282 |
+
"std_dimension_correlation": 0.1757826105994209,
|
| 283 |
+
"linear_cka": 0.9765625
|
| 284 |
+
},
|
| 285 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 286 |
+
"mse": 1.2734375,
|
| 287 |
+
"mean_cosine_similarity": 0.1796875,
|
| 288 |
+
"std_cosine_similarity": 0.058349609375,
|
| 289 |
+
"mean_l2_distance": 65.0,
|
| 290 |
+
"std_l2_distance": 2.3125,
|
| 291 |
+
"mean_dimension_correlation": 0.16931568142026662,
|
| 292 |
+
"std_dimension_correlation": 0.17925366106210552,
|
| 293 |
+
"linear_cka": 0.984375
|
| 294 |
+
},
|
| 295 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
|
| 296 |
+
"mse": 1.2890625,
|
| 297 |
+
"mean_cosine_similarity": 0.15625,
|
| 298 |
+
"std_cosine_similarity": 0.048828125,
|
| 299 |
+
"mean_l2_distance": 65.5,
|
| 300 |
+
"std_l2_distance": 1.90625,
|
| 301 |
+
"mean_dimension_correlation": 0.1511871140450239,
|
| 302 |
+
"std_dimension_correlation": 0.17476462218981592,
|
| 303 |
+
"linear_cka": 0.97265625
|
| 304 |
+
},
|
| 305 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
|
| 306 |
+
"mse": 1.234375,
|
| 307 |
+
"mean_cosine_similarity": 0.220703125,
|
| 308 |
+
"std_cosine_similarity": 0.052490234375,
|
| 309 |
+
"mean_l2_distance": 63.25,
|
| 310 |
+
"std_l2_distance": 2.09375,
|
| 311 |
+
"mean_dimension_correlation": 0.2026286849519238,
|
| 312 |
+
"std_dimension_correlation": 0.17765936039692512,
|
| 313 |
+
"linear_cka": 0.9765625
|
| 314 |
+
},
|
| 315 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 316 |
+
"mse": 1.28125,
|
| 317 |
+
"mean_cosine_similarity": 0.1640625,
|
| 318 |
+
"std_cosine_similarity": 0.051513671875,
|
| 319 |
+
"mean_l2_distance": 65.5,
|
| 320 |
+
"std_l2_distance": 2.03125,
|
| 321 |
+
"mean_dimension_correlation": 0.15531704826280474,
|
| 322 |
+
"std_dimension_correlation": 0.1748249456034516,
|
| 323 |
+
"linear_cka": 0.97265625
|
| 324 |
+
},
|
| 325 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 326 |
+
"mse": 1.28125,
|
| 327 |
+
"mean_cosine_similarity": 0.162109375,
|
| 328 |
+
"std_cosine_similarity": 0.05517578125,
|
| 329 |
+
"mean_l2_distance": 65.5,
|
| 330 |
+
"std_l2_distance": 2.15625,
|
| 331 |
+
"mean_dimension_correlation": 0.1522526470012963,
|
| 332 |
+
"std_dimension_correlation": 0.17404197310947275,
|
| 333 |
+
"linear_cka": 0.96875
|
| 334 |
+
},
|
| 335 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 336 |
+
"mse": 1.28125,
|
| 337 |
+
"mean_cosine_similarity": 0.171875,
|
| 338 |
+
"std_cosine_similarity": 0.060791015625,
|
| 339 |
+
"mean_l2_distance": 65.0,
|
| 340 |
+
"std_l2_distance": 2.390625,
|
| 341 |
+
"mean_dimension_correlation": 0.15887909792363644,
|
| 342 |
+
"std_dimension_correlation": 0.1775497121320674,
|
| 343 |
+
"linear_cka": 0.97265625
|
| 344 |
+
},
|
| 345 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 346 |
+
"mse": 1.2890625,
|
| 347 |
+
"mean_cosine_similarity": 0.15625,
|
| 348 |
+
"std_cosine_similarity": 0.048828125,
|
| 349 |
+
"mean_l2_distance": 65.5,
|
| 350 |
+
"std_l2_distance": 1.90625,
|
| 351 |
+
"mean_dimension_correlation": 0.15117020402103662,
|
| 352 |
+
"std_dimension_correlation": 0.17475352199577512,
|
| 353 |
+
"linear_cka": 0.97265625
|
| 354 |
+
},
|
| 355 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
|
| 356 |
+
"mse": 1.2890625,
|
| 357 |
+
"mean_cosine_similarity": 0.1591796875,
|
| 358 |
+
"std_cosine_similarity": 0.052490234375,
|
| 359 |
+
"mean_l2_distance": 65.5,
|
| 360 |
+
"std_l2_distance": 2.046875,
|
| 361 |
+
"mean_dimension_correlation": 0.14662780333310366,
|
| 362 |
+
"std_dimension_correlation": 0.17160536584318073,
|
| 363 |
+
"linear_cka": 0.9765625
|
| 364 |
+
},
|
| 365 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 366 |
+
"mse": 1.25,
|
| 367 |
+
"mean_cosine_similarity": 0.203125,
|
| 368 |
+
"std_cosine_similarity": 0.0556640625,
|
| 369 |
+
"mean_l2_distance": 63.75,
|
| 370 |
+
"std_l2_distance": 2.21875,
|
| 371 |
+
"mean_dimension_correlation": 0.19489949941635132,
|
| 372 |
+
"std_dimension_correlation": 0.17601193178810715,
|
| 373 |
+
"linear_cka": 0.98046875
|
| 374 |
+
},
|
| 375 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 376 |
+
"mse": 1.28125,
|
| 377 |
+
"mean_cosine_similarity": 0.1728515625,
|
| 378 |
+
"std_cosine_similarity": 0.050048828125,
|
| 379 |
+
"mean_l2_distance": 65.0,
|
| 380 |
+
"std_l2_distance": 1.9609375,
|
| 381 |
+
"mean_dimension_correlation": 0.16633510272949933,
|
| 382 |
+
"std_dimension_correlation": 0.17675989044615031,
|
| 383 |
+
"linear_cka": 0.984375
|
| 384 |
+
},
|
| 385 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 386 |
+
"mse": 1.2265625,
|
| 387 |
+
"mean_cosine_similarity": 0.2421875,
|
| 388 |
+
"std_cosine_similarity": 0.060302734375,
|
| 389 |
+
"mean_l2_distance": 62.25,
|
| 390 |
+
"std_l2_distance": 2.453125,
|
| 391 |
+
"mean_dimension_correlation": 0.22512691989541053,
|
| 392 |
+
"std_dimension_correlation": 0.17232222187158686,
|
| 393 |
+
"linear_cka": 0.98046875
|
| 394 |
+
},
|
| 395 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 396 |
+
"mse": 1.234375,
|
| 397 |
+
"mean_cosine_similarity": 0.220703125,
|
| 398 |
+
"std_cosine_similarity": 0.052490234375,
|
| 399 |
+
"mean_l2_distance": 63.25,
|
| 400 |
+
"std_l2_distance": 2.09375,
|
| 401 |
+
"mean_dimension_correlation": 0.20264770851936192,
|
| 402 |
+
"std_dimension_correlation": 0.1776996694007629,
|
| 403 |
+
"linear_cka": 0.9765625
|
| 404 |
+
},
|
| 405 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
|
| 406 |
+
"mse": 1.2890625,
|
| 407 |
+
"mean_cosine_similarity": 0.1591796875,
|
| 408 |
+
"std_cosine_similarity": 0.052490234375,
|
| 409 |
+
"mean_l2_distance": 65.5,
|
| 410 |
+
"std_l2_distance": 2.046875,
|
| 411 |
+
"mean_dimension_correlation": 0.14663737285882233,
|
| 412 |
+
"std_dimension_correlation": 0.17162838967231733,
|
| 413 |
+
"linear_cka": 0.9765625
|
| 414 |
+
},
|
| 415 |
+
"avg_mse": 1.2625,
|
| 416 |
+
"std_mse": 0.02111784888824301,
|
| 417 |
+
"avg_mean_cosine_similarity": 0.18977864583333334,
|
| 418 |
+
"std_mean_cosine_similarity": 0.026181266453470867,
|
| 419 |
+
"avg_std_cosine_similarity": 0.05514322916666667,
|
| 420 |
+
"std_std_cosine_similarity": 0.004130588740067507,
|
| 421 |
+
"avg_mean_l2_distance": 64.36666666666666,
|
| 422 |
+
"std_mean_l2_distance": 1.0241527663824812,
|
| 423 |
+
"avg_std_l2_distance": 2.1864583333333334,
|
| 424 |
+
"std_std_l2_distance": 0.1757810570986596,
|
| 425 |
+
"avg_mean_dimension_correlation": 0.17837394241786875,
|
| 426 |
+
"std_mean_dimension_correlation": 0.02338007192514464,
|
| 427 |
+
"avg_std_dimension_correlation": 0.17504746274293098,
|
| 428 |
+
"std_std_dimension_correlation": 0.0024286700198214582,
|
| 429 |
+
"avg_linear_cka": 0.9776041666666667,
|
| 430 |
+
"std_linear_cka": 0.004388619673529352
|
| 431 |
+
}
|
| 432 |
+
}
|
| 433 |
+
}
|
evaluation/metrics_tokens_2000896.json
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 1,
|
| 3 |
+
"n_tokens": 2000896,
|
| 4 |
+
"global_step": 1954,
|
| 5 |
+
"training_metrics": {
|
| 6 |
+
"train/loss": 2.609375,
|
| 7 |
+
"train/contrastive": 2.5,
|
| 8 |
+
"train/recons_loss": 0.71484375,
|
| 9 |
+
"train/balance_loss": 3.671875,
|
| 10 |
+
"train/balance_loss_contrastive": 2.625,
|
| 11 |
+
"train/balance_loss_recons": 1.046875,
|
| 12 |
+
"train/contrastive_std": 3.234375,
|
| 13 |
+
"train/recons_std": 0.171875,
|
| 14 |
+
"train/contrastive_min": 0.224609375,
|
| 15 |
+
"train/contrastive_max": 7.0,
|
| 16 |
+
"train/recons_min": 0.5859375,
|
| 17 |
+
"train/recons_max": 1.046875,
|
| 18 |
+
"train/Qwen3_0.6B_layer_2": 1.046875,
|
| 19 |
+
"train/Qwen3_0.6B_layer_4": 0.63671875,
|
| 20 |
+
"train/Qwen3_1.7B_layer_2": 0.62109375,
|
| 21 |
+
"train/Qwen3_1.7B_layer_4": 0.75,
|
| 22 |
+
"train/Qwen3_4B_layer_2": 0.5859375,
|
| 23 |
+
"train/Qwen3_4B_layer_4": 0.65234375,
|
| 24 |
+
"train/contrastives": null,
|
| 25 |
+
"train/epoch": 1,
|
| 26 |
+
"train/n_tokens": 2000896,
|
| 27 |
+
"train/step": 1954
|
| 28 |
+
},
|
| 29 |
+
"eval_metrics": {
|
| 30 |
+
"global_step": 1954,
|
| 31 |
+
"n_tokens": 2000896,
|
| 32 |
+
"kl_divergence": {
|
| 33 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 11.937955856323242,
|
| 34 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 7.9807448387146,
|
| 35 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 7.929330348968506,
|
| 36 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 7.9499993324279785,
|
| 37 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.048929214477539,
|
| 38 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 7.740438938140869,
|
| 39 |
+
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
|
| 40 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 10.935715675354004,
|
| 41 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 3.0637950897216797,
|
| 42 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 3.0137126445770264,
|
| 43 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.9935271739959717,
|
| 44 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 3.0705885887145996,
|
| 45 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 3.0900540351867676,
|
| 46 |
+
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
|
| 47 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 9.36762523651123,
|
| 48 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 6.4384565353393555,
|
| 49 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.606346130371094,
|
| 50 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.644039154052734,
|
| 51 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.400282859802246,
|
| 52 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.326376438140869,
|
| 53 |
+
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
|
| 54 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 12.973535537719727,
|
| 55 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.9450173377990723,
|
| 56 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 3.160464286804199,
|
| 57 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.980670928955078,
|
| 58 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 3.0805249214172363,
|
| 59 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 3.193880319595337,
|
| 60 |
+
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
|
| 61 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 7.039875030517578,
|
| 62 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.9839415550231934,
|
| 63 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.829629421234131,
|
| 64 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.9013402462005615,
|
| 65 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.699265241622925,
|
| 66 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.6922624111175537,
|
| 67 |
+
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
|
| 68 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 7.345184326171875,
|
| 69 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.6716532707214355,
|
| 70 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.481139659881592,
|
| 71 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.7798919677734375,
|
| 72 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.6100268363952637,
|
| 73 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.4427032470703125,
|
| 74 |
+
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
|
| 75 |
+
},
|
| 76 |
+
"mae_hidden_states": {
|
| 77 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 3.8776168823242188,
|
| 78 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.3933213949203491,
|
| 79 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.383978009223938,
|
| 80 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.3917444944381714,
|
| 81 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.38372004032135,
|
| 82 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.364012598991394,
|
| 83 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 4.228211402893066,
|
| 84 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.2656141519546509,
|
| 85 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.232149362564087,
|
| 86 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.249756097793579,
|
| 87 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.2561695575714111,
|
| 88 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.2504116296768188,
|
| 89 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 7.011510372161865,
|
| 90 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.1570074558258057,
|
| 91 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.123882532119751,
|
| 92 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.1837798357009888,
|
| 93 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.1635627746582031,
|
| 94 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.1612880229949951,
|
| 95 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 3.041527032852173,
|
| 96 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.6266475915908813,
|
| 97 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.6221141815185547,
|
| 98 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.5699771642684937,
|
| 99 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.6569658517837524,
|
| 100 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.6420214176177979,
|
| 101 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.062718391418457,
|
| 102 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.3707829713821411,
|
| 103 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.2881925106048584,
|
| 104 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.2928853034973145,
|
| 105 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.213749885559082,
|
| 106 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.2994227409362793,
|
| 107 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 7.752654075622559,
|
| 108 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.4750916957855225,
|
| 109 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.4292265176773071,
|
| 110 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.4410502910614014,
|
| 111 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.4525748491287231,
|
| 112 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.3758857250213623
|
| 113 |
+
},
|
| 114 |
+
"alignment": {
|
| 115 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 116 |
+
"mse": 1.3515625,
|
| 117 |
+
"mean_cosine_similarity": 0.08251953125,
|
| 118 |
+
"std_cosine_similarity": 0.05322265625,
|
| 119 |
+
"mean_l2_distance": 68.5,
|
| 120 |
+
"std_l2_distance": 2.0,
|
| 121 |
+
"mean_dimension_correlation": 0.062478048354387285,
|
| 122 |
+
"std_dimension_correlation": 0.18795068082189698,
|
| 123 |
+
"linear_cka": 0.6640625
|
| 124 |
+
},
|
| 125 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 126 |
+
"mse": 1.3515625,
|
| 127 |
+
"mean_cosine_similarity": 0.07666015625,
|
| 128 |
+
"std_cosine_similarity": 0.0595703125,
|
| 129 |
+
"mean_l2_distance": 68.5,
|
| 130 |
+
"std_l2_distance": 2.25,
|
| 131 |
+
"mean_dimension_correlation": 0.06342285592108965,
|
| 132 |
+
"std_dimension_correlation": 0.18283416080924633,
|
| 133 |
+
"linear_cka": 0.6640625
|
| 134 |
+
},
|
| 135 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 136 |
+
"mse": 1.3359375,
|
| 137 |
+
"mean_cosine_similarity": 0.0859375,
|
| 138 |
+
"std_cosine_similarity": 0.057373046875,
|
| 139 |
+
"mean_l2_distance": 68.5,
|
| 140 |
+
"std_l2_distance": 2.171875,
|
| 141 |
+
"mean_dimension_correlation": 0.06745534756919369,
|
| 142 |
+
"std_dimension_correlation": 0.1829785716985573,
|
| 143 |
+
"linear_cka": 0.65234375
|
| 144 |
+
},
|
| 145 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
|
| 146 |
+
"mse": 1.359375,
|
| 147 |
+
"mean_cosine_similarity": 0.0654296875,
|
| 148 |
+
"std_cosine_similarity": 0.051025390625,
|
| 149 |
+
"mean_l2_distance": 69.0,
|
| 150 |
+
"std_l2_distance": 1.9140625,
|
| 151 |
+
"mean_dimension_correlation": 0.0574939165264368,
|
| 152 |
+
"std_dimension_correlation": 0.1857303062299582,
|
| 153 |
+
"linear_cka": 0.66015625
|
| 154 |
+
},
|
| 155 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
|
| 156 |
+
"mse": 1.3515625,
|
| 157 |
+
"mean_cosine_similarity": 0.0703125,
|
| 158 |
+
"std_cosine_similarity": 0.0615234375,
|
| 159 |
+
"mean_l2_distance": 69.0,
|
| 160 |
+
"std_l2_distance": 2.3125,
|
| 161 |
+
"mean_dimension_correlation": 0.060864800889976325,
|
| 162 |
+
"std_dimension_correlation": 0.18444010568801722,
|
| 163 |
+
"linear_cka": 0.66015625
|
| 164 |
+
},
|
| 165 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 166 |
+
"mse": 1.3515625,
|
| 167 |
+
"mean_cosine_similarity": 0.08251953125,
|
| 168 |
+
"std_cosine_similarity": 0.05322265625,
|
| 169 |
+
"mean_l2_distance": 68.5,
|
| 170 |
+
"std_l2_distance": 2.0,
|
| 171 |
+
"mean_dimension_correlation": 0.062481947243213654,
|
| 172 |
+
"std_dimension_correlation": 0.18797583962876385,
|
| 173 |
+
"linear_cka": 0.6640625
|
| 174 |
+
},
|
| 175 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 176 |
+
"mse": 1.15625,
|
| 177 |
+
"mean_cosine_similarity": 0.31640625,
|
| 178 |
+
"std_cosine_similarity": 0.10888671875,
|
| 179 |
+
"mean_l2_distance": 59.0,
|
| 180 |
+
"std_l2_distance": 4.84375,
|
| 181 |
+
"mean_dimension_correlation": 0.29400850236415865,
|
| 182 |
+
"std_dimension_correlation": 0.16629643255508064,
|
| 183 |
+
"linear_cka": 0.984375
|
| 184 |
+
},
|
| 185 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 186 |
+
"mse": 1.140625,
|
| 187 |
+
"mean_cosine_similarity": 0.333984375,
|
| 188 |
+
"std_cosine_similarity": 0.1083984375,
|
| 189 |
+
"mean_l2_distance": 58.25,
|
| 190 |
+
"std_l2_distance": 4.84375,
|
| 191 |
+
"mean_dimension_correlation": 0.3036854453384876,
|
| 192 |
+
"std_dimension_correlation": 0.1640714460889962,
|
| 193 |
+
"linear_cka": 0.98046875
|
| 194 |
+
},
|
| 195 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
|
| 196 |
+
"mse": 1.1953125,
|
| 197 |
+
"mean_cosine_similarity": 0.2734375,
|
| 198 |
+
"std_cosine_similarity": 0.109375,
|
| 199 |
+
"mean_l2_distance": 61.0,
|
| 200 |
+
"std_l2_distance": 4.6875,
|
| 201 |
+
"mean_dimension_correlation": 0.2504168091341853,
|
| 202 |
+
"std_dimension_correlation": 0.16876857548561683,
|
| 203 |
+
"linear_cka": 0.9765625
|
| 204 |
+
},
|
| 205 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
|
| 206 |
+
"mse": 1.1796875,
|
| 207 |
+
"mean_cosine_similarity": 0.2890625,
|
| 208 |
+
"std_cosine_similarity": 0.10791015625,
|
| 209 |
+
"mean_l2_distance": 60.25,
|
| 210 |
+
"std_l2_distance": 4.6875,
|
| 211 |
+
"mean_dimension_correlation": 0.2720076544210315,
|
| 212 |
+
"std_dimension_correlation": 0.16885400186672725,
|
| 213 |
+
"linear_cka": 0.98828125
|
| 214 |
+
},
|
| 215 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 216 |
+
"mse": 1.3515625,
|
| 217 |
+
"mean_cosine_similarity": 0.07666015625,
|
| 218 |
+
"std_cosine_similarity": 0.0595703125,
|
| 219 |
+
"mean_l2_distance": 68.5,
|
| 220 |
+
"std_l2_distance": 2.25,
|
| 221 |
+
"mean_dimension_correlation": 0.06342689506709576,
|
| 222 |
+
"std_dimension_correlation": 0.18284259799255337,
|
| 223 |
+
"linear_cka": 0.6640625
|
| 224 |
+
},
|
| 225 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 226 |
+
"mse": 1.15625,
|
| 227 |
+
"mean_cosine_similarity": 0.31640625,
|
| 228 |
+
"std_cosine_similarity": 0.10888671875,
|
| 229 |
+
"mean_l2_distance": 59.0,
|
| 230 |
+
"std_l2_distance": 4.84375,
|
| 231 |
+
"mean_dimension_correlation": 0.29399659037590026,
|
| 232 |
+
"std_dimension_correlation": 0.16628270680485127,
|
| 233 |
+
"linear_cka": 0.984375
|
| 234 |
+
},
|
| 235 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 236 |
+
"mse": 1.171875,
|
| 237 |
+
"mean_cosine_similarity": 0.302734375,
|
| 238 |
+
"std_cosine_similarity": 0.1162109375,
|
| 239 |
+
"mean_l2_distance": 59.5,
|
| 240 |
+
"std_l2_distance": 5.09375,
|
| 241 |
+
"mean_dimension_correlation": 0.2821845322847366,
|
| 242 |
+
"std_dimension_correlation": 0.1658260527951995,
|
| 243 |
+
"linear_cka": 0.984375
|
| 244 |
+
},
|
| 245 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
|
| 246 |
+
"mse": 1.1875,
|
| 247 |
+
"mean_cosine_similarity": 0.279296875,
|
| 248 |
+
"std_cosine_similarity": 0.1083984375,
|
| 249 |
+
"mean_l2_distance": 60.75,
|
| 250 |
+
"std_l2_distance": 4.6875,
|
| 251 |
+
"mean_dimension_correlation": 0.2564893037080765,
|
| 252 |
+
"std_dimension_correlation": 0.1700593172594885,
|
| 253 |
+
"linear_cka": 0.98828125
|
| 254 |
+
},
|
| 255 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
|
| 256 |
+
"mse": 1.125,
|
| 257 |
+
"mean_cosine_similarity": 0.3515625,
|
| 258 |
+
"std_cosine_similarity": 0.107421875,
|
| 259 |
+
"mean_l2_distance": 57.5,
|
| 260 |
+
"std_l2_distance": 4.875,
|
| 261 |
+
"mean_dimension_correlation": 0.32521353638730943,
|
| 262 |
+
"std_dimension_correlation": 0.16142420298822246,
|
| 263 |
+
"linear_cka": 0.98828125
|
| 264 |
+
},
|
| 265 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 266 |
+
"mse": 1.3359375,
|
| 267 |
+
"mean_cosine_similarity": 0.0859375,
|
| 268 |
+
"std_cosine_similarity": 0.057373046875,
|
| 269 |
+
"mean_l2_distance": 68.5,
|
| 270 |
+
"std_l2_distance": 2.171875,
|
| 271 |
+
"mean_dimension_correlation": 0.06747776636620983,
|
| 272 |
+
"std_dimension_correlation": 0.18297839209554984,
|
| 273 |
+
"linear_cka": 0.65234375
|
| 274 |
+
},
|
| 275 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 276 |
+
"mse": 1.140625,
|
| 277 |
+
"mean_cosine_similarity": 0.333984375,
|
| 278 |
+
"std_cosine_similarity": 0.1083984375,
|
| 279 |
+
"mean_l2_distance": 58.25,
|
| 280 |
+
"std_l2_distance": 4.84375,
|
| 281 |
+
"mean_dimension_correlation": 0.3036851711571217,
|
| 282 |
+
"std_dimension_correlation": 0.16406197803154954,
|
| 283 |
+
"linear_cka": 0.98046875
|
| 284 |
+
},
|
| 285 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 286 |
+
"mse": 1.171875,
|
| 287 |
+
"mean_cosine_similarity": 0.302734375,
|
| 288 |
+
"std_cosine_similarity": 0.1162109375,
|
| 289 |
+
"mean_l2_distance": 59.5,
|
| 290 |
+
"std_l2_distance": 5.09375,
|
| 291 |
+
"mean_dimension_correlation": 0.2821806937456131,
|
| 292 |
+
"std_dimension_correlation": 0.16583393871178487,
|
| 293 |
+
"linear_cka": 0.984375
|
| 294 |
+
},
|
| 295 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
|
| 296 |
+
"mse": 1.1953125,
|
| 297 |
+
"mean_cosine_similarity": 0.267578125,
|
| 298 |
+
"std_cosine_similarity": 0.099609375,
|
| 299 |
+
"mean_l2_distance": 61.0,
|
| 300 |
+
"std_l2_distance": 4.25,
|
| 301 |
+
"mean_dimension_correlation": 0.25347145795822146,
|
| 302 |
+
"std_dimension_correlation": 0.1676427892496719,
|
| 303 |
+
"linear_cka": 0.9765625
|
| 304 |
+
},
|
| 305 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
|
| 306 |
+
"mse": 1.140625,
|
| 307 |
+
"mean_cosine_similarity": 0.3359375,
|
| 308 |
+
"std_cosine_similarity": 0.10302734375,
|
| 309 |
+
"mean_l2_distance": 58.25,
|
| 310 |
+
"std_l2_distance": 4.5625,
|
| 311 |
+
"mean_dimension_correlation": 0.3062414702028036,
|
| 312 |
+
"std_dimension_correlation": 0.16808821448337685,
|
| 313 |
+
"linear_cka": 0.984375
|
| 314 |
+
},
|
| 315 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 316 |
+
"mse": 1.359375,
|
| 317 |
+
"mean_cosine_similarity": 0.0654296875,
|
| 318 |
+
"std_cosine_similarity": 0.051025390625,
|
| 319 |
+
"mean_l2_distance": 69.0,
|
| 320 |
+
"std_l2_distance": 1.9140625,
|
| 321 |
+
"mean_dimension_correlation": 0.05748535506427288,
|
| 322 |
+
"std_dimension_correlation": 0.18570980538433127,
|
| 323 |
+
"linear_cka": 0.66015625
|
| 324 |
+
},
|
| 325 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 326 |
+
"mse": 1.1953125,
|
| 327 |
+
"mean_cosine_similarity": 0.2734375,
|
| 328 |
+
"std_cosine_similarity": 0.109375,
|
| 329 |
+
"mean_l2_distance": 61.0,
|
| 330 |
+
"std_l2_distance": 4.6875,
|
| 331 |
+
"mean_dimension_correlation": 0.25039467196911575,
|
| 332 |
+
"std_dimension_correlation": 0.16873641618577465,
|
| 333 |
+
"linear_cka": 0.9765625
|
| 334 |
+
},
|
| 335 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 336 |
+
"mse": 1.1875,
|
| 337 |
+
"mean_cosine_similarity": 0.279296875,
|
| 338 |
+
"std_cosine_similarity": 0.1083984375,
|
| 339 |
+
"mean_l2_distance": 60.75,
|
| 340 |
+
"std_l2_distance": 4.6875,
|
| 341 |
+
"mean_dimension_correlation": 0.25649560913443564,
|
| 342 |
+
"std_dimension_correlation": 0.17006732480563802,
|
| 343 |
+
"linear_cka": 0.98828125
|
| 344 |
+
},
|
| 345 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 346 |
+
"mse": 1.1953125,
|
| 347 |
+
"mean_cosine_similarity": 0.267578125,
|
| 348 |
+
"std_cosine_similarity": 0.099609375,
|
| 349 |
+
"mean_l2_distance": 61.0,
|
| 350 |
+
"std_l2_distance": 4.25,
|
| 351 |
+
"mean_dimension_correlation": 0.2534836530685425,
|
| 352 |
+
"std_dimension_correlation": 0.16762209134956418,
|
| 353 |
+
"linear_cka": 0.9765625
|
| 354 |
+
},
|
| 355 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
|
| 356 |
+
"mse": 1.1953125,
|
| 357 |
+
"mean_cosine_similarity": 0.267578125,
|
| 358 |
+
"std_cosine_similarity": 0.1044921875,
|
| 359 |
+
"mean_l2_distance": 61.0,
|
| 360 |
+
"std_l2_distance": 4.46875,
|
| 361 |
+
"mean_dimension_correlation": 0.2458049923181534,
|
| 362 |
+
"std_dimension_correlation": 0.16614846928894367,
|
| 363 |
+
"linear_cka": 0.984375
|
| 364 |
+
},
|
| 365 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 366 |
+
"mse": 1.3515625,
|
| 367 |
+
"mean_cosine_similarity": 0.0703125,
|
| 368 |
+
"std_cosine_similarity": 0.0615234375,
|
| 369 |
+
"mean_l2_distance": 69.0,
|
| 370 |
+
"std_l2_distance": 2.3125,
|
| 371 |
+
"mean_dimension_correlation": 0.060881674400297923,
|
| 372 |
+
"std_dimension_correlation": 0.18442433029309818,
|
| 373 |
+
"linear_cka": 0.66015625
|
| 374 |
+
},
|
| 375 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 376 |
+
"mse": 1.1796875,
|
| 377 |
+
"mean_cosine_similarity": 0.2890625,
|
| 378 |
+
"std_cosine_similarity": 0.10791015625,
|
| 379 |
+
"mean_l2_distance": 60.25,
|
| 380 |
+
"std_l2_distance": 4.6875,
|
| 381 |
+
"mean_dimension_correlation": 0.27202143501490356,
|
| 382 |
+
"std_dimension_correlation": 0.16887910421184668,
|
| 383 |
+
"linear_cka": 0.98828125
|
| 384 |
+
},
|
| 385 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 386 |
+
"mse": 1.125,
|
| 387 |
+
"mean_cosine_similarity": 0.3515625,
|
| 388 |
+
"std_cosine_similarity": 0.107421875,
|
| 389 |
+
"mean_l2_distance": 57.5,
|
| 390 |
+
"std_l2_distance": 4.875,
|
| 391 |
+
"mean_dimension_correlation": 0.32520583919249474,
|
| 392 |
+
"std_dimension_correlation": 0.16142301505759077,
|
| 393 |
+
"linear_cka": 0.98828125
|
| 394 |
+
},
|
| 395 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 396 |
+
"mse": 1.140625,
|
| 397 |
+
"mean_cosine_similarity": 0.3359375,
|
| 398 |
+
"std_cosine_similarity": 0.10302734375,
|
| 399 |
+
"mean_l2_distance": 58.25,
|
| 400 |
+
"std_l2_distance": 4.5625,
|
| 401 |
+
"mean_dimension_correlation": 0.30626075267791747,
|
| 402 |
+
"std_dimension_correlation": 0.16814174967857434,
|
| 403 |
+
"linear_cka": 0.984375
|
| 404 |
+
},
|
| 405 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
|
| 406 |
+
"mse": 1.1953125,
|
| 407 |
+
"mean_cosine_similarity": 0.267578125,
|
| 408 |
+
"std_cosine_similarity": 0.1044921875,
|
| 409 |
+
"mean_l2_distance": 61.0,
|
| 410 |
+
"std_l2_distance": 4.46875,
|
| 411 |
+
"mean_dimension_correlation": 0.24580164328217508,
|
| 412 |
+
"std_dimension_correlation": 0.16616246993231076,
|
| 413 |
+
"linear_cka": 0.984375
|
| 414 |
+
},
|
| 415 |
+
"avg_mse": 1.2291666666666667,
|
| 416 |
+
"std_mse": 0.08795763263576896,
|
| 417 |
+
"avg_mean_cosine_similarity": 0.2265625,
|
| 418 |
+
"std_mean_cosine_similarity": 0.10912461458526285,
|
| 419 |
+
"avg_std_cosine_similarity": 0.0904296875,
|
| 420 |
+
"std_std_cosine_similarity": 0.02430735142446207,
|
| 421 |
+
"avg_mean_l2_distance": 62.666666666666664,
|
| 422 |
+
"std_mean_l2_distance": 4.391911757866827,
|
| 423 |
+
"avg_std_l2_distance": 3.8432291666666667,
|
| 424 |
+
"std_std_l2_distance": 1.2284684192595492,
|
| 425 |
+
"avg_mean_dimension_correlation": 0.20675061237125192,
|
| 426 |
+
"std_mean_dimension_correlation": 0.10433772738239133,
|
| 427 |
+
"avg_std_dimension_correlation": 0.17274183624909276,
|
| 428 |
+
"std_std_dimension_correlation": 0.008813959193590607,
|
| 429 |
+
"avg_linear_cka": 0.87578125,
|
| 430 |
+
"std_linear_cka": 0.15252860046015415
|
| 431 |
+
}
|
| 432 |
+
}
|
| 433 |
+
}
|
evaluation/metrics_tokens_3001344.json
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 1,
|
| 3 |
+
"n_tokens": 3001344,
|
| 4 |
+
"global_step": 2931,
|
| 5 |
+
"training_metrics": {
|
| 6 |
+
"train/loss": 2.515625,
|
| 7 |
+
"train/contrastive": 2.421875,
|
| 8 |
+
"train/recons_loss": 0.671875,
|
| 9 |
+
"train/balance_loss": 3.75,
|
| 10 |
+
"train/balance_loss_contrastive": 2.71875,
|
| 11 |
+
"train/balance_loss_recons": 1.0390625,
|
| 12 |
+
"train/contrastive_std": 3.25,
|
| 13 |
+
"train/recons_std": 0.138671875,
|
| 14 |
+
"train/contrastive_min": 0.146484375,
|
| 15 |
+
"train/contrastive_max": 6.9375,
|
| 16 |
+
"train/recons_min": 0.56640625,
|
| 17 |
+
"train/recons_max": 0.9375,
|
| 18 |
+
"train/Qwen3_0.6B_layer_2": 0.9375,
|
| 19 |
+
"train/Qwen3_0.6B_layer_4": 0.59765625,
|
| 20 |
+
"train/Qwen3_1.7B_layer_2": 0.59375,
|
| 21 |
+
"train/Qwen3_1.7B_layer_4": 0.703125,
|
| 22 |
+
"train/Qwen3_4B_layer_2": 0.56640625,
|
| 23 |
+
"train/Qwen3_4B_layer_4": 0.6328125,
|
| 24 |
+
"train/contrastives": null,
|
| 25 |
+
"train/epoch": 1,
|
| 26 |
+
"train/n_tokens": 3001344,
|
| 27 |
+
"train/step": 2931
|
| 28 |
+
},
|
| 29 |
+
"eval_metrics": {
|
| 30 |
+
"global_step": 2931,
|
| 31 |
+
"n_tokens": 3001344,
|
| 32 |
+
"kl_divergence": {
|
| 33 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 11.318835258483887,
|
| 34 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 9.138021469116211,
|
| 35 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 9.61973762512207,
|
| 36 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 9.007281303405762,
|
| 37 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.960853576660156,
|
| 38 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 8.789403915405273,
|
| 39 |
+
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
|
| 40 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 7.3046698570251465,
|
| 41 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.55082368850708,
|
| 42 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.5602962970733643,
|
| 43 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.592942714691162,
|
| 44 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.588857650756836,
|
| 45 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.6625943183898926,
|
| 46 |
+
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
|
| 47 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 10.131369590759277,
|
| 48 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.891963481903076,
|
| 49 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.430274963378906,
|
| 50 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.0684638023376465,
|
| 51 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 5.9689507484436035,
|
| 52 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.356847286224365,
|
| 53 |
+
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
|
| 54 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 8.19615364074707,
|
| 55 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.8310694694519043,
|
| 56 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.7546491622924805,
|
| 57 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.7474663257598877,
|
| 58 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.857220411300659,
|
| 59 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.925436019897461,
|
| 60 |
+
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
|
| 61 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 7.565979957580566,
|
| 62 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.9663586616516113,
|
| 63 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.719478130340576,
|
| 64 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.741952657699585,
|
| 65 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.7755935192108154,
|
| 66 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.7375831604003906,
|
| 67 |
+
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
|
| 68 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 7.4653778076171875,
|
| 69 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.67035174369812,
|
| 70 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.566011905670166,
|
| 71 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.7160496711730957,
|
| 72 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.552424907684326,
|
| 73 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.459855556488037,
|
| 74 |
+
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
|
| 75 |
+
},
|
| 76 |
+
"mae_hidden_states": {
|
| 77 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 15.485275268554688,
|
| 78 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 22.359243392944336,
|
| 79 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 21.841341018676758,
|
| 80 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 20.851577758789062,
|
| 81 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 23.41849136352539,
|
| 82 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 22.13389015197754,
|
| 83 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 8.68209457397461,
|
| 84 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.0910885334014893,
|
| 85 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0663363933563232,
|
| 86 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.1295608282089233,
|
| 87 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.096497654914856,
|
| 88 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.0976781845092773,
|
| 89 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 9.745889663696289,
|
| 90 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.073387622833252,
|
| 91 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.0651912689208984,
|
| 92 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.1097475290298462,
|
| 93 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.102055311203003,
|
| 94 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.1042507886886597,
|
| 95 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.085488319396973,
|
| 96 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.443469762802124,
|
| 97 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.407573938369751,
|
| 98 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.394163966178894,
|
| 99 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.4274914264678955,
|
| 100 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.423639178276062,
|
| 101 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.723683834075928,
|
| 102 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.2199777364730835,
|
| 103 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.1646456718444824,
|
| 104 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.1640838384628296,
|
| 105 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.1155877113342285,
|
| 106 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.1568272113800049,
|
| 107 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 8.499415397644043,
|
| 108 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.2985743284225464,
|
| 109 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.2958557605743408,
|
| 110 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.2903549671173096,
|
| 111 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.2823046445846558,
|
| 112 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.2616506814956665
|
| 113 |
+
},
|
| 114 |
+
"alignment": {
|
| 115 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 116 |
+
"mse": 1.421875,
|
| 117 |
+
"mean_cosine_similarity": -0.03369140625,
|
| 118 |
+
"std_cosine_similarity": 0.109375,
|
| 119 |
+
"mean_l2_distance": 72.5,
|
| 120 |
+
"std_l2_distance": 3.90625,
|
| 121 |
+
"mean_dimension_correlation": 0.254237837344408,
|
| 122 |
+
"std_dimension_correlation": 0.16181929189675745,
|
| 123 |
+
"linear_cka": 0.5859375
|
| 124 |
+
},
|
| 125 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 126 |
+
"mse": 1.421875,
|
| 127 |
+
"mean_cosine_similarity": -0.0284423828125,
|
| 128 |
+
"std_cosine_similarity": 0.10888671875,
|
| 129 |
+
"mean_l2_distance": 72.5,
|
| 130 |
+
"std_l2_distance": 3.890625,
|
| 131 |
+
"mean_dimension_correlation": 0.25683254674077033,
|
| 132 |
+
"std_dimension_correlation": 0.16029215327593901,
|
| 133 |
+
"linear_cka": 0.57421875
|
| 134 |
+
},
|
| 135 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 136 |
+
"mse": 1.4140625,
|
| 137 |
+
"mean_cosine_similarity": -0.0252685546875,
|
| 138 |
+
"std_cosine_similarity": 0.1083984375,
|
| 139 |
+
"mean_l2_distance": 72.5,
|
| 140 |
+
"std_l2_distance": 3.875,
|
| 141 |
+
"mean_dimension_correlation": 0.25395019352436066,
|
| 142 |
+
"std_dimension_correlation": 0.15926056622745546,
|
| 143 |
+
"linear_cka": 0.578125
|
| 144 |
+
},
|
| 145 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
|
| 146 |
+
"mse": 1.421875,
|
| 147 |
+
"mean_cosine_similarity": -0.03271484375,
|
| 148 |
+
"std_cosine_similarity": 0.1064453125,
|
| 149 |
+
"mean_l2_distance": 72.5,
|
| 150 |
+
"std_l2_distance": 3.796875,
|
| 151 |
+
"mean_dimension_correlation": 0.24886183738708495,
|
| 152 |
+
"std_dimension_correlation": 0.15849261736593726,
|
| 153 |
+
"linear_cka": 0.55859375
|
| 154 |
+
},
|
| 155 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
|
| 156 |
+
"mse": 1.421875,
|
| 157 |
+
"mean_cosine_similarity": -0.033203125,
|
| 158 |
+
"std_cosine_similarity": 0.109375,
|
| 159 |
+
"mean_l2_distance": 72.5,
|
| 160 |
+
"std_l2_distance": 3.890625,
|
| 161 |
+
"mean_dimension_correlation": 0.256584095954895,
|
| 162 |
+
"std_dimension_correlation": 0.15873214025442897,
|
| 163 |
+
"linear_cka": 0.57421875
|
| 164 |
+
},
|
| 165 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 166 |
+
"mse": 1.4296875,
|
| 167 |
+
"mean_cosine_similarity": -0.03369140625,
|
| 168 |
+
"std_cosine_similarity": 0.109375,
|
| 169 |
+
"mean_l2_distance": 72.5,
|
| 170 |
+
"std_l2_distance": 3.90625,
|
| 171 |
+
"mean_dimension_correlation": 0.2542317323386669,
|
| 172 |
+
"std_dimension_correlation": 0.16183266276519212,
|
| 173 |
+
"linear_cka": 0.5859375
|
| 174 |
+
},
|
| 175 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 176 |
+
"mse": 0.734375,
|
| 177 |
+
"mean_cosine_similarity": 0.65625,
|
| 178 |
+
"std_cosine_similarity": 0.28515625,
|
| 179 |
+
"mean_l2_distance": 37.25,
|
| 180 |
+
"std_l2_distance": 19.5,
|
| 181 |
+
"mean_dimension_correlation": 0.6187647342681885,
|
| 182 |
+
"std_dimension_correlation": 0.11470426666838326,
|
| 183 |
+
"linear_cka": 0.984375
|
| 184 |
+
},
|
| 185 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 186 |
+
"mse": 0.7265625,
|
| 187 |
+
"mean_cosine_similarity": 0.66015625,
|
| 188 |
+
"std_cosine_similarity": 0.279296875,
|
| 189 |
+
"mean_l2_distance": 37.0,
|
| 190 |
+
"std_l2_distance": 19.25,
|
| 191 |
+
"mean_dimension_correlation": 0.6220208525657653,
|
| 192 |
+
"std_dimension_correlation": 0.11040039509848326,
|
| 193 |
+
"linear_cka": 0.984375
|
| 194 |
+
},
|
| 195 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
|
| 196 |
+
"mse": 0.76171875,
|
| 197 |
+
"mean_cosine_similarity": 0.62890625,
|
| 198 |
+
"std_cosine_similarity": 0.302734375,
|
| 199 |
+
"mean_l2_distance": 38.75,
|
| 200 |
+
"std_l2_distance": 20.125,
|
| 201 |
+
"mean_dimension_correlation": 0.592758321762085,
|
| 202 |
+
"std_dimension_correlation": 0.11886540241980308,
|
| 203 |
+
"linear_cka": 0.98046875
|
| 204 |
+
},
|
| 205 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
|
| 206 |
+
"mse": 0.74609375,
|
| 207 |
+
"mean_cosine_similarity": 0.63671875,
|
| 208 |
+
"std_cosine_similarity": 0.302734375,
|
| 209 |
+
"mean_l2_distance": 38.25,
|
| 210 |
+
"std_l2_distance": 20.25,
|
| 211 |
+
"mean_dimension_correlation": 0.6037769317626953,
|
| 212 |
+
"std_dimension_correlation": 0.11647753822253991,
|
| 213 |
+
"linear_cka": 0.984375
|
| 214 |
+
},
|
| 215 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 216 |
+
"mse": 1.4296875,
|
| 217 |
+
"mean_cosine_similarity": -0.0284423828125,
|
| 218 |
+
"std_cosine_similarity": 0.10888671875,
|
| 219 |
+
"mean_l2_distance": 72.5,
|
| 220 |
+
"std_l2_distance": 3.890625,
|
| 221 |
+
"mean_dimension_correlation": 0.25684744566679,
|
| 222 |
+
"std_dimension_correlation": 0.16032274573798164,
|
| 223 |
+
"linear_cka": 0.57421875
|
| 224 |
+
},
|
| 225 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 226 |
+
"mse": 0.734375,
|
| 227 |
+
"mean_cosine_similarity": 0.65625,
|
| 228 |
+
"std_cosine_similarity": 0.28515625,
|
| 229 |
+
"mean_l2_distance": 37.25,
|
| 230 |
+
"std_l2_distance": 19.5,
|
| 231 |
+
"mean_dimension_correlation": 0.6187384128570557,
|
| 232 |
+
"std_dimension_correlation": 0.11471572089316741,
|
| 233 |
+
"linear_cka": 0.984375
|
| 234 |
+
},
|
| 235 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 236 |
+
"mse": 0.734375,
|
| 237 |
+
"mean_cosine_similarity": 0.6484375,
|
| 238 |
+
"std_cosine_similarity": 0.30078125,
|
| 239 |
+
"mean_l2_distance": 37.5,
|
| 240 |
+
"std_l2_distance": 20.375,
|
| 241 |
+
"mean_dimension_correlation": 0.6119367599487304,
|
| 242 |
+
"std_dimension_correlation": 0.1157440646478159,
|
| 243 |
+
"linear_cka": 0.99609375
|
| 244 |
+
},
|
| 245 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
|
| 246 |
+
"mse": 0.75390625,
|
| 247 |
+
"mean_cosine_similarity": 0.63671875,
|
| 248 |
+
"std_cosine_similarity": 0.30078125,
|
| 249 |
+
"mean_l2_distance": 38.25,
|
| 250 |
+
"std_l2_distance": 20.375,
|
| 251 |
+
"mean_dimension_correlation": 0.5996460914611816,
|
| 252 |
+
"std_dimension_correlation": 0.11944124129277625,
|
| 253 |
+
"linear_cka": 0.98046875
|
| 254 |
+
},
|
| 255 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
|
| 256 |
+
"mse": 0.70703125,
|
| 257 |
+
"mean_cosine_similarity": 0.67578125,
|
| 258 |
+
"std_cosine_similarity": 0.27734375,
|
| 259 |
+
"mean_l2_distance": 36.0,
|
| 260 |
+
"std_l2_distance": 19.5,
|
| 261 |
+
"mean_dimension_correlation": 0.638215160369873,
|
| 262 |
+
"std_dimension_correlation": 0.10975611081697591,
|
| 263 |
+
"linear_cka": 0.984375
|
| 264 |
+
},
|
| 265 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 266 |
+
"mse": 1.4140625,
|
| 267 |
+
"mean_cosine_similarity": -0.0252685546875,
|
| 268 |
+
"std_cosine_similarity": 0.1083984375,
|
| 269 |
+
"mean_l2_distance": 72.5,
|
| 270 |
+
"std_l2_distance": 3.875,
|
| 271 |
+
"mean_dimension_correlation": 0.25395837128162385,
|
| 272 |
+
"std_dimension_correlation": 0.15926372552177567,
|
| 273 |
+
"linear_cka": 0.578125
|
| 274 |
+
},
|
| 275 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 276 |
+
"mse": 0.7265625,
|
| 277 |
+
"mean_cosine_similarity": 0.66015625,
|
| 278 |
+
"std_cosine_similarity": 0.279296875,
|
| 279 |
+
"mean_l2_distance": 37.0,
|
| 280 |
+
"std_l2_distance": 19.25,
|
| 281 |
+
"mean_dimension_correlation": 0.6219659209251404,
|
| 282 |
+
"std_dimension_correlation": 0.11032863879333923,
|
| 283 |
+
"linear_cka": 0.984375
|
| 284 |
+
},
|
| 285 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 286 |
+
"mse": 0.734375,
|
| 287 |
+
"mean_cosine_similarity": 0.6484375,
|
| 288 |
+
"std_cosine_similarity": 0.30078125,
|
| 289 |
+
"mean_l2_distance": 37.5,
|
| 290 |
+
"std_l2_distance": 20.375,
|
| 291 |
+
"mean_dimension_correlation": 0.6119108200073242,
|
| 292 |
+
"std_dimension_correlation": 0.1157383378132106,
|
| 293 |
+
"linear_cka": 0.99609375
|
| 294 |
+
},
|
| 295 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
|
| 296 |
+
"mse": 0.7578125,
|
| 297 |
+
"mean_cosine_similarity": 0.6328125,
|
| 298 |
+
"std_cosine_similarity": 0.298828125,
|
| 299 |
+
"mean_l2_distance": 38.5,
|
| 300 |
+
"std_l2_distance": 20.125,
|
| 301 |
+
"mean_dimension_correlation": 0.5979020118713378,
|
| 302 |
+
"std_dimension_correlation": 0.1151705814719715,
|
| 303 |
+
"linear_cka": 0.984375
|
| 304 |
+
},
|
| 305 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
|
| 306 |
+
"mse": 0.71875,
|
| 307 |
+
"mean_cosine_similarity": 0.6640625,
|
| 308 |
+
"std_cosine_similarity": 0.28125,
|
| 309 |
+
"mean_l2_distance": 36.75,
|
| 310 |
+
"std_l2_distance": 19.5,
|
| 311 |
+
"mean_dimension_correlation": 0.6274345874786377,
|
| 312 |
+
"std_dimension_correlation": 0.11253210388812478,
|
| 313 |
+
"linear_cka": 0.98046875
|
| 314 |
+
},
|
| 315 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 316 |
+
"mse": 1.4296875,
|
| 317 |
+
"mean_cosine_similarity": -0.03271484375,
|
| 318 |
+
"std_cosine_similarity": 0.1064453125,
|
| 319 |
+
"mean_l2_distance": 72.5,
|
| 320 |
+
"std_l2_distance": 3.796875,
|
| 321 |
+
"mean_dimension_correlation": 0.24887723177671434,
|
| 322 |
+
"std_dimension_correlation": 0.15850834600861563,
|
| 323 |
+
"linear_cka": 0.55859375
|
| 324 |
+
},
|
| 325 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 326 |
+
"mse": 0.76171875,
|
| 327 |
+
"mean_cosine_similarity": 0.62890625,
|
| 328 |
+
"std_cosine_similarity": 0.302734375,
|
| 329 |
+
"mean_l2_distance": 38.75,
|
| 330 |
+
"std_l2_distance": 20.125,
|
| 331 |
+
"mean_dimension_correlation": 0.5927883148193359,
|
| 332 |
+
"std_dimension_correlation": 0.11887853166661289,
|
| 333 |
+
"linear_cka": 0.98046875
|
| 334 |
+
},
|
| 335 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 336 |
+
"mse": 0.75390625,
|
| 337 |
+
"mean_cosine_similarity": 0.63671875,
|
| 338 |
+
"std_cosine_similarity": 0.30078125,
|
| 339 |
+
"mean_l2_distance": 38.25,
|
| 340 |
+
"std_l2_distance": 20.375,
|
| 341 |
+
"mean_dimension_correlation": 0.5995779991149902,
|
| 342 |
+
"std_dimension_correlation": 0.1193691675179003,
|
| 343 |
+
"linear_cka": 0.98046875
|
| 344 |
+
},
|
| 345 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 346 |
+
"mse": 0.7578125,
|
| 347 |
+
"mean_cosine_similarity": 0.6328125,
|
| 348 |
+
"std_cosine_similarity": 0.298828125,
|
| 349 |
+
"mean_l2_distance": 38.5,
|
| 350 |
+
"std_l2_distance": 20.125,
|
| 351 |
+
"mean_dimension_correlation": 0.5978128433227539,
|
| 352 |
+
"std_dimension_correlation": 0.11512506347641102,
|
| 353 |
+
"linear_cka": 0.984375
|
| 354 |
+
},
|
| 355 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
|
| 356 |
+
"mse": 0.75390625,
|
| 357 |
+
"mean_cosine_similarity": 0.62890625,
|
| 358 |
+
"std_cosine_similarity": 0.3046875,
|
| 359 |
+
"mean_l2_distance": 38.5,
|
| 360 |
+
"std_l2_distance": 20.625,
|
| 361 |
+
"mean_dimension_correlation": 0.5955796241760254,
|
| 362 |
+
"std_dimension_correlation": 0.11906185378925987,
|
| 363 |
+
"linear_cka": 0.98828125
|
| 364 |
+
},
|
| 365 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 366 |
+
"mse": 1.4296875,
|
| 367 |
+
"mean_cosine_similarity": -0.033203125,
|
| 368 |
+
"std_cosine_similarity": 0.109375,
|
| 369 |
+
"mean_l2_distance": 72.5,
|
| 370 |
+
"std_l2_distance": 3.890625,
|
| 371 |
+
"mean_dimension_correlation": 0.2565764158964157,
|
| 372 |
+
"std_dimension_correlation": 0.1587071816624074,
|
| 373 |
+
"linear_cka": 0.57421875
|
| 374 |
+
},
|
| 375 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 376 |
+
"mse": 0.74609375,
|
| 377 |
+
"mean_cosine_similarity": 0.63671875,
|
| 378 |
+
"std_cosine_similarity": 0.302734375,
|
| 379 |
+
"mean_l2_distance": 38.25,
|
| 380 |
+
"std_l2_distance": 20.25,
|
| 381 |
+
"mean_dimension_correlation": 0.6037120819091797,
|
| 382 |
+
"std_dimension_correlation": 0.11639985412027169,
|
| 383 |
+
"linear_cka": 0.984375
|
| 384 |
+
},
|
| 385 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 386 |
+
"mse": 0.70703125,
|
| 387 |
+
"mean_cosine_similarity": 0.67578125,
|
| 388 |
+
"std_cosine_similarity": 0.27734375,
|
| 389 |
+
"mean_l2_distance": 36.0,
|
| 390 |
+
"std_l2_distance": 19.5,
|
| 391 |
+
"mean_dimension_correlation": 0.6382188320159912,
|
| 392 |
+
"std_dimension_correlation": 0.10972459865917429,
|
| 393 |
+
"linear_cka": 0.984375
|
| 394 |
+
},
|
| 395 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 396 |
+
"mse": 0.71875,
|
| 397 |
+
"mean_cosine_similarity": 0.6640625,
|
| 398 |
+
"std_cosine_similarity": 0.28125,
|
| 399 |
+
"mean_l2_distance": 36.75,
|
| 400 |
+
"std_l2_distance": 19.5,
|
| 401 |
+
"mean_dimension_correlation": 0.6273346900939941,
|
| 402 |
+
"std_dimension_correlation": 0.1124933006393999,
|
| 403 |
+
"linear_cka": 0.98046875
|
| 404 |
+
},
|
| 405 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
|
| 406 |
+
"mse": 0.75390625,
|
| 407 |
+
"mean_cosine_similarity": 0.62890625,
|
| 408 |
+
"std_cosine_similarity": 0.3046875,
|
| 409 |
+
"mean_l2_distance": 38.5,
|
| 410 |
+
"std_l2_distance": 20.625,
|
| 411 |
+
"mean_dimension_correlation": 0.5955384254455567,
|
| 412 |
+
"std_dimension_correlation": 0.11905433194805992,
|
| 413 |
+
"linear_cka": 0.98828125
|
| 414 |
+
},
|
| 415 |
+
"avg_mse": 0.9674479166666666,
|
| 416 |
+
"std_mse": 0.32276016873074226,
|
| 417 |
+
"avg_mean_cosine_similarity": 0.4210286458333333,
|
| 418 |
+
"std_mean_cosine_similarity": 0.31965592389258923,
|
| 419 |
+
"avg_std_cosine_similarity": 0.23173828125,
|
| 420 |
+
"std_std_cosine_similarity": 0.08757208624967457,
|
| 421 |
+
"avg_mean_l2_distance": 49.28333333333333,
|
| 422 |
+
"std_mean_l2_distance": 16.43189648890907,
|
| 423 |
+
"avg_std_l2_distance": 14.598958333333334,
|
| 424 |
+
"std_std_l2_distance": 7.594291533045653,
|
| 425 |
+
"avg_mean_dimension_correlation": 0.49188637080291897,
|
| 426 |
+
"std_mean_dimension_correlation": 0.16857047305704442,
|
| 427 |
+
"avg_std_dimension_correlation": 0.1300404178186724,
|
| 428 |
+
"std_std_dimension_correlation": 0.021172306029780062,
|
| 429 |
+
"avg_linear_cka": 0.8479166666666667,
|
| 430 |
+
"std_linear_cka": 0.19363585907609904
|
| 431 |
+
}
|
| 432 |
+
}
|
| 433 |
+
}
|
evaluation/metrics_tokens_4001792.json
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 1,
|
| 3 |
+
"n_tokens": 4001792,
|
| 4 |
+
"global_step": 3908,
|
| 5 |
+
"training_metrics": {
|
| 6 |
+
"train/loss": 2.5,
|
| 7 |
+
"train/contrastive": 2.40625,
|
| 8 |
+
"train/recons_loss": 0.671875,
|
| 9 |
+
"train/balance_loss": 3.828125,
|
| 10 |
+
"train/balance_loss_contrastive": 2.78125,
|
| 11 |
+
"train/balance_loss_recons": 1.046875,
|
| 12 |
+
"train/contrastive_std": 3.265625,
|
| 13 |
+
"train/recons_std": 0.1513671875,
|
| 14 |
+
"train/contrastive_min": 0.1162109375,
|
| 15 |
+
"train/contrastive_max": 6.9375,
|
| 16 |
+
"train/recons_min": 0.55859375,
|
| 17 |
+
"train/recons_max": 0.96484375,
|
| 18 |
+
"train/Qwen3_0.6B_layer_2": 0.96484375,
|
| 19 |
+
"train/Qwen3_0.6B_layer_4": 0.6015625,
|
| 20 |
+
"train/Qwen3_1.7B_layer_2": 0.58203125,
|
| 21 |
+
"train/Qwen3_1.7B_layer_4": 0.69140625,
|
| 22 |
+
"train/Qwen3_4B_layer_2": 0.55859375,
|
| 23 |
+
"train/Qwen3_4B_layer_4": 0.625,
|
| 24 |
+
"train/contrastives": null,
|
| 25 |
+
"train/epoch": 1,
|
| 26 |
+
"train/n_tokens": 4001792,
|
| 27 |
+
"train/step": 3908
|
| 28 |
+
},
|
| 29 |
+
"eval_metrics": {
|
| 30 |
+
"global_step": 3908,
|
| 31 |
+
"n_tokens": 4001792,
|
| 32 |
+
"kl_divergence": {
|
| 33 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 10.677733421325684,
|
| 34 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 10.070417404174805,
|
| 35 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 10.500988960266113,
|
| 36 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 10.254755973815918,
|
| 37 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 10.141581535339355,
|
| 38 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 10.209218978881836,
|
| 39 |
+
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
|
| 40 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 6.944426536560059,
|
| 41 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.526094675064087,
|
| 42 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.448215961456299,
|
| 43 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.5273706912994385,
|
| 44 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.518568515777588,
|
| 45 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.5949084758758545,
|
| 46 |
+
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
|
| 47 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 10.105497360229492,
|
| 48 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 6.14721155166626,
|
| 49 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.3550543785095215,
|
| 50 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.340244293212891,
|
| 51 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.490333557128906,
|
| 52 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.536875247955322,
|
| 53 |
+
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
|
| 54 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 9.303935050964355,
|
| 55 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.7055323123931885,
|
| 56 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.6092498302459717,
|
| 57 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.6337990760803223,
|
| 58 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.66951322555542,
|
| 59 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.742098569869995,
|
| 60 |
+
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
|
| 61 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 8.335909843444824,
|
| 62 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.7109298706054688,
|
| 63 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.387141704559326,
|
| 64 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.432076930999756,
|
| 65 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.466850519180298,
|
| 66 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.3831467628479004,
|
| 67 |
+
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
|
| 68 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 8.078448295593262,
|
| 69 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.5085513591766357,
|
| 70 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.5161335468292236,
|
| 71 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.7921173572540283,
|
| 72 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.47990345954895,
|
| 73 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.2069830894470215,
|
| 74 |
+
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
|
| 75 |
+
},
|
| 76 |
+
"mae_hidden_states": {
|
| 77 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 9.193017959594727,
|
| 78 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 5.069667816162109,
|
| 79 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.365467071533203,
|
| 80 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 5.159329414367676,
|
| 81 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 5.38785982131958,
|
| 82 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 5.0614728927612305,
|
| 83 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 5.8135833740234375,
|
| 84 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.0534567832946777,
|
| 85 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0686990022659302,
|
| 86 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.093401312828064,
|
| 87 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.055011510848999,
|
| 88 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.0553429126739502,
|
| 89 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 8.750565528869629,
|
| 90 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.014384150505066,
|
| 91 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.0321710109710693,
|
| 92 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.048608422279358,
|
| 93 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.0492010116577148,
|
| 94 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.053753137588501,
|
| 95 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 5.545437812805176,
|
| 96 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.3332059383392334,
|
| 97 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.340571641921997,
|
| 98 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.3302825689315796,
|
| 99 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.3458808660507202,
|
| 100 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.3539550304412842,
|
| 101 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 7.418992042541504,
|
| 102 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.1341036558151245,
|
| 103 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.0986826419830322,
|
| 104 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.1034446954727173,
|
| 105 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.0604543685913086,
|
| 106 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.0890880823135376,
|
| 107 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.221832275390625,
|
| 108 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.2409595251083374,
|
| 109 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.2512269020080566,
|
| 110 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.2450522184371948,
|
| 111 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.2315609455108643,
|
| 112 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.2183749675750732
|
| 113 |
+
},
|
| 114 |
+
"alignment": {
|
| 115 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 116 |
+
"mse": 1.3515625,
|
| 117 |
+
"mean_cosine_similarity": 0.052001953125,
|
| 118 |
+
"std_cosine_similarity": 0.19140625,
|
| 119 |
+
"mean_l2_distance": 69.5,
|
| 120 |
+
"std_l2_distance": 7.3125,
|
| 121 |
+
"mean_dimension_correlation": 0.46465563774108887,
|
| 122 |
+
"std_dimension_correlation": 0.1362067287833464,
|
| 123 |
+
"linear_cka": 0.5859375
|
| 124 |
+
},
|
| 125 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 126 |
+
"mse": 1.34375,
|
| 127 |
+
"mean_cosine_similarity": 0.056396484375,
|
| 128 |
+
"std_cosine_similarity": 0.19140625,
|
| 129 |
+
"mean_l2_distance": 69.0,
|
| 130 |
+
"std_l2_distance": 7.34375,
|
| 131 |
+
"mean_dimension_correlation": 0.46726187616586684,
|
| 132 |
+
"std_dimension_correlation": 0.13268670396178475,
|
| 133 |
+
"linear_cka": 0.578125
|
| 134 |
+
},
|
| 135 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 136 |
+
"mse": 1.34375,
|
| 137 |
+
"mean_cosine_similarity": 0.055908203125,
|
| 138 |
+
"std_cosine_similarity": 0.1904296875,
|
| 139 |
+
"mean_l2_distance": 69.0,
|
| 140 |
+
"std_l2_distance": 7.34375,
|
| 141 |
+
"mean_dimension_correlation": 0.4647917509078979,
|
| 142 |
+
"std_dimension_correlation": 0.13471787446655337,
|
| 143 |
+
"linear_cka": 0.578125
|
| 144 |
+
},
|
| 145 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
|
| 146 |
+
"mse": 1.3515625,
|
| 147 |
+
"mean_cosine_similarity": 0.056640625,
|
| 148 |
+
"std_cosine_similarity": 0.1884765625,
|
| 149 |
+
"mean_l2_distance": 69.0,
|
| 150 |
+
"std_l2_distance": 7.21875,
|
| 151 |
+
"mean_dimension_correlation": 0.4657045602798462,
|
| 152 |
+
"std_dimension_correlation": 0.13361017606712636,
|
| 153 |
+
"linear_cka": 0.57421875
|
| 154 |
+
},
|
| 155 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
|
| 156 |
+
"mse": 1.34375,
|
| 157 |
+
"mean_cosine_similarity": 0.05615234375,
|
| 158 |
+
"std_cosine_similarity": 0.1904296875,
|
| 159 |
+
"mean_l2_distance": 69.0,
|
| 160 |
+
"std_l2_distance": 7.3125,
|
| 161 |
+
"mean_dimension_correlation": 0.4670211374759674,
|
| 162 |
+
"std_dimension_correlation": 0.13379253598505308,
|
| 163 |
+
"linear_cka": 0.57421875
|
| 164 |
+
},
|
| 165 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 166 |
+
"mse": 1.359375,
|
| 167 |
+
"mean_cosine_similarity": 0.052001953125,
|
| 168 |
+
"std_cosine_similarity": 0.19140625,
|
| 169 |
+
"mean_l2_distance": 69.5,
|
| 170 |
+
"std_l2_distance": 7.3125,
|
| 171 |
+
"mean_dimension_correlation": 0.4646653652191162,
|
| 172 |
+
"std_dimension_correlation": 0.13617385784126732,
|
| 173 |
+
"linear_cka": 0.5859375
|
| 174 |
+
},
|
| 175 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 176 |
+
"mse": 0.5,
|
| 177 |
+
"mean_cosine_similarity": 0.8046875,
|
| 178 |
+
"std_cosine_similarity": 0.255859375,
|
| 179 |
+
"mean_l2_distance": 25.5,
|
| 180 |
+
"std_l2_distance": 19.375,
|
| 181 |
+
"mean_dimension_correlation": 0.7786048889160156,
|
| 182 |
+
"std_dimension_correlation": 0.079747066045607,
|
| 183 |
+
"linear_cka": 0.984375
|
| 184 |
+
},
|
| 185 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 186 |
+
"mse": 0.498046875,
|
| 187 |
+
"mean_cosine_similarity": 0.8046875,
|
| 188 |
+
"std_cosine_similarity": 0.25390625,
|
| 189 |
+
"mean_l2_distance": 25.375,
|
| 190 |
+
"std_l2_distance": 19.25,
|
| 191 |
+
"mean_dimension_correlation": 0.7792343139648438,
|
| 192 |
+
"std_dimension_correlation": 0.07860444177664169,
|
| 193 |
+
"linear_cka": 0.98828125
|
| 194 |
+
},
|
| 195 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
|
| 196 |
+
"mse": 0.51953125,
|
| 197 |
+
"mean_cosine_similarity": 0.78515625,
|
| 198 |
+
"std_cosine_similarity": 0.275390625,
|
| 199 |
+
"mean_l2_distance": 26.5,
|
| 200 |
+
"std_l2_distance": 20.25,
|
| 201 |
+
"mean_dimension_correlation": 0.7631843566894532,
|
| 202 |
+
"std_dimension_correlation": 0.08458475161101357,
|
| 203 |
+
"linear_cka": 0.98828125
|
| 204 |
+
},
|
| 205 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
|
| 206 |
+
"mse": 0.5078125,
|
| 207 |
+
"mean_cosine_similarity": 0.79296875,
|
| 208 |
+
"std_cosine_similarity": 0.2734375,
|
| 209 |
+
"mean_l2_distance": 26.0,
|
| 210 |
+
"std_l2_distance": 20.125,
|
| 211 |
+
"mean_dimension_correlation": 0.7681190490722656,
|
| 212 |
+
"std_dimension_correlation": 0.08387350384855204,
|
| 213 |
+
"linear_cka": 0.984375
|
| 214 |
+
},
|
| 215 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 216 |
+
"mse": 1.3515625,
|
| 217 |
+
"mean_cosine_similarity": 0.056396484375,
|
| 218 |
+
"std_cosine_similarity": 0.19140625,
|
| 219 |
+
"mean_l2_distance": 69.0,
|
| 220 |
+
"std_l2_distance": 7.34375,
|
| 221 |
+
"mean_dimension_correlation": 0.46729940325021746,
|
| 222 |
+
"std_dimension_correlation": 0.13270665905666312,
|
| 223 |
+
"linear_cka": 0.578125
|
| 224 |
+
},
|
| 225 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 226 |
+
"mse": 0.5,
|
| 227 |
+
"mean_cosine_similarity": 0.8046875,
|
| 228 |
+
"std_cosine_similarity": 0.255859375,
|
| 229 |
+
"mean_l2_distance": 25.5,
|
| 230 |
+
"std_l2_distance": 19.375,
|
| 231 |
+
"mean_dimension_correlation": 0.7785774230957031,
|
| 232 |
+
"std_dimension_correlation": 0.07977299719796638,
|
| 233 |
+
"linear_cka": 0.984375
|
| 234 |
+
},
|
| 235 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 236 |
+
"mse": 0.49609375,
|
| 237 |
+
"mean_cosine_similarity": 0.796875,
|
| 238 |
+
"std_cosine_similarity": 0.2734375,
|
| 239 |
+
"mean_l2_distance": 25.25,
|
| 240 |
+
"std_l2_distance": 20.375,
|
| 241 |
+
"mean_dimension_correlation": 0.7738082885742188,
|
| 242 |
+
"std_dimension_correlation": 0.08244148252527034,
|
| 243 |
+
"linear_cka": 0.98828125
|
| 244 |
+
},
|
| 245 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
|
| 246 |
+
"mse": 0.50390625,
|
| 247 |
+
"mean_cosine_similarity": 0.79296875,
|
| 248 |
+
"std_cosine_similarity": 0.275390625,
|
| 249 |
+
"mean_l2_distance": 25.75,
|
| 250 |
+
"std_l2_distance": 20.5,
|
| 251 |
+
"mean_dimension_correlation": 0.7685455322265625,
|
| 252 |
+
"std_dimension_correlation": 0.08636337823761847,
|
| 253 |
+
"linear_cka": 0.98828125
|
| 254 |
+
},
|
| 255 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
|
| 256 |
+
"mse": 0.48046875,
|
| 257 |
+
"mean_cosine_similarity": 0.8125,
|
| 258 |
+
"std_cosine_similarity": 0.251953125,
|
| 259 |
+
"mean_l2_distance": 24.5,
|
| 260 |
+
"std_l2_distance": 19.375,
|
| 261 |
+
"mean_dimension_correlation": 0.789703369140625,
|
| 262 |
+
"std_dimension_correlation": 0.07704774466213117,
|
| 263 |
+
"linear_cka": 0.984375
|
| 264 |
+
},
|
| 265 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 266 |
+
"mse": 1.34375,
|
| 267 |
+
"mean_cosine_similarity": 0.055908203125,
|
| 268 |
+
"std_cosine_similarity": 0.1904296875,
|
| 269 |
+
"mean_l2_distance": 69.0,
|
| 270 |
+
"std_l2_distance": 7.34375,
|
| 271 |
+
"mean_dimension_correlation": 0.4647957801818848,
|
| 272 |
+
"std_dimension_correlation": 0.1347461643666133,
|
| 273 |
+
"linear_cka": 0.578125
|
| 274 |
+
},
|
| 275 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 276 |
+
"mse": 0.49609375,
|
| 277 |
+
"mean_cosine_similarity": 0.8046875,
|
| 278 |
+
"std_cosine_similarity": 0.25390625,
|
| 279 |
+
"mean_l2_distance": 25.375,
|
| 280 |
+
"std_l2_distance": 19.25,
|
| 281 |
+
"mean_dimension_correlation": 0.779193115234375,
|
| 282 |
+
"std_dimension_correlation": 0.07862977772942846,
|
| 283 |
+
"linear_cka": 0.98828125
|
| 284 |
+
},
|
| 285 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 286 |
+
"mse": 0.4921875,
|
| 287 |
+
"mean_cosine_similarity": 0.796875,
|
| 288 |
+
"std_cosine_similarity": 0.2734375,
|
| 289 |
+
"mean_l2_distance": 25.25,
|
| 290 |
+
"std_l2_distance": 20.375,
|
| 291 |
+
"mean_dimension_correlation": 0.773846435546875,
|
| 292 |
+
"std_dimension_correlation": 0.08246401911605972,
|
| 293 |
+
"linear_cka": 0.98828125
|
| 294 |
+
},
|
| 295 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
|
| 296 |
+
"mse": 0.5078125,
|
| 297 |
+
"mean_cosine_similarity": 0.79296875,
|
| 298 |
+
"std_cosine_similarity": 0.271484375,
|
| 299 |
+
"mean_l2_distance": 26.0,
|
| 300 |
+
"std_l2_distance": 20.125,
|
| 301 |
+
"mean_dimension_correlation": 0.7682723999023438,
|
| 302 |
+
"std_dimension_correlation": 0.08173679476643078,
|
| 303 |
+
"linear_cka": 0.984375
|
| 304 |
+
},
|
| 305 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
|
| 306 |
+
"mse": 0.486328125,
|
| 307 |
+
"mean_cosine_similarity": 0.80859375,
|
| 308 |
+
"std_cosine_similarity": 0.255859375,
|
| 309 |
+
"mean_l2_distance": 24.875,
|
| 310 |
+
"std_l2_distance": 19.5,
|
| 311 |
+
"mean_dimension_correlation": 0.7830284118652344,
|
| 312 |
+
"std_dimension_correlation": 0.07756386958443834,
|
| 313 |
+
"linear_cka": 0.98046875
|
| 314 |
+
},
|
| 315 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 316 |
+
"mse": 1.359375,
|
| 317 |
+
"mean_cosine_similarity": 0.056640625,
|
| 318 |
+
"std_cosine_similarity": 0.1884765625,
|
| 319 |
+
"mean_l2_distance": 69.0,
|
| 320 |
+
"std_l2_distance": 7.21875,
|
| 321 |
+
"mean_dimension_correlation": 0.46567630767822266,
|
| 322 |
+
"std_dimension_correlation": 0.13364195702919346,
|
| 323 |
+
"linear_cka": 0.57421875
|
| 324 |
+
},
|
| 325 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 326 |
+
"mse": 0.51953125,
|
| 327 |
+
"mean_cosine_similarity": 0.78515625,
|
| 328 |
+
"std_cosine_similarity": 0.275390625,
|
| 329 |
+
"mean_l2_distance": 26.5,
|
| 330 |
+
"std_l2_distance": 20.25,
|
| 331 |
+
"mean_dimension_correlation": 0.7631195068359375,
|
| 332 |
+
"std_dimension_correlation": 0.08451723229099471,
|
| 333 |
+
"linear_cka": 0.98828125
|
| 334 |
+
},
|
| 335 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 336 |
+
"mse": 0.50390625,
|
| 337 |
+
"mean_cosine_similarity": 0.79296875,
|
| 338 |
+
"std_cosine_similarity": 0.275390625,
|
| 339 |
+
"mean_l2_distance": 25.75,
|
| 340 |
+
"std_l2_distance": 20.5,
|
| 341 |
+
"mean_dimension_correlation": 0.7685020446777344,
|
| 342 |
+
"std_dimension_correlation": 0.08637723380547804,
|
| 343 |
+
"linear_cka": 0.98828125
|
| 344 |
+
},
|
| 345 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 346 |
+
"mse": 0.5078125,
|
| 347 |
+
"mean_cosine_similarity": 0.79296875,
|
| 348 |
+
"std_cosine_similarity": 0.271484375,
|
| 349 |
+
"mean_l2_distance": 26.0,
|
| 350 |
+
"std_l2_distance": 20.125,
|
| 351 |
+
"mean_dimension_correlation": 0.7681938171386719,
|
| 352 |
+
"std_dimension_correlation": 0.08170402844520411,
|
| 353 |
+
"linear_cka": 0.984375
|
| 354 |
+
},
|
| 355 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
|
| 356 |
+
"mse": 0.5,
|
| 357 |
+
"mean_cosine_similarity": 0.7890625,
|
| 358 |
+
"std_cosine_similarity": 0.27734375,
|
| 359 |
+
"mean_l2_distance": 25.5,
|
| 360 |
+
"std_l2_distance": 20.75,
|
| 361 |
+
"mean_dimension_correlation": 0.7680191040039063,
|
| 362 |
+
"std_dimension_correlation": 0.08532466419571123,
|
| 363 |
+
"linear_cka": 0.98828125
|
| 364 |
+
},
|
| 365 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 366 |
+
"mse": 1.3515625,
|
| 367 |
+
"mean_cosine_similarity": 0.05615234375,
|
| 368 |
+
"std_cosine_similarity": 0.1904296875,
|
| 369 |
+
"mean_l2_distance": 69.0,
|
| 370 |
+
"std_l2_distance": 7.3125,
|
| 371 |
+
"mean_dimension_correlation": 0.4670826017856598,
|
| 372 |
+
"std_dimension_correlation": 0.13384197305399426,
|
| 373 |
+
"linear_cka": 0.57421875
|
| 374 |
+
},
|
| 375 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 376 |
+
"mse": 0.5078125,
|
| 377 |
+
"mean_cosine_similarity": 0.79296875,
|
| 378 |
+
"std_cosine_similarity": 0.2734375,
|
| 379 |
+
"mean_l2_distance": 26.0,
|
| 380 |
+
"std_l2_distance": 20.125,
|
| 381 |
+
"mean_dimension_correlation": 0.7681541442871094,
|
| 382 |
+
"std_dimension_correlation": 0.08380469982339137,
|
| 383 |
+
"linear_cka": 0.984375
|
| 384 |
+
},
|
| 385 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 386 |
+
"mse": 0.48046875,
|
| 387 |
+
"mean_cosine_similarity": 0.8125,
|
| 388 |
+
"std_cosine_similarity": 0.251953125,
|
| 389 |
+
"mean_l2_distance": 24.5,
|
| 390 |
+
"std_l2_distance": 19.375,
|
| 391 |
+
"mean_dimension_correlation": 0.7896865844726563,
|
| 392 |
+
"std_dimension_correlation": 0.07704049416139637,
|
| 393 |
+
"linear_cka": 0.984375
|
| 394 |
+
},
|
| 395 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 396 |
+
"mse": 0.48828125,
|
| 397 |
+
"mean_cosine_similarity": 0.80859375,
|
| 398 |
+
"std_cosine_similarity": 0.255859375,
|
| 399 |
+
"mean_l2_distance": 24.875,
|
| 400 |
+
"std_l2_distance": 19.5,
|
| 401 |
+
"mean_dimension_correlation": 0.7829521179199219,
|
| 402 |
+
"std_dimension_correlation": 0.07745501980282286,
|
| 403 |
+
"linear_cka": 0.98046875
|
| 404 |
+
},
|
| 405 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
|
| 406 |
+
"mse": 0.5,
|
| 407 |
+
"mean_cosine_similarity": 0.7890625,
|
| 408 |
+
"std_cosine_similarity": 0.27734375,
|
| 409 |
+
"mean_l2_distance": 25.5,
|
| 410 |
+
"std_l2_distance": 20.75,
|
| 411 |
+
"mean_dimension_correlation": 0.7680793762207031,
|
| 412 |
+
"std_dimension_correlation": 0.08533230341332358,
|
| 413 |
+
"linear_cka": 0.98828125
|
| 414 |
+
},
|
| 415 |
+
"avg_mse": 0.783203125,
|
| 416 |
+
"std_mse": 0.4008924066126565,
|
| 417 |
+
"avg_mean_cosine_similarity": 0.5505045572916667,
|
| 418 |
+
"std_mean_cosine_similarity": 0.3501489988188365,
|
| 419 |
+
"avg_std_cosine_similarity": 0.24108072916666667,
|
| 420 |
+
"std_std_cosine_similarity": 0.036733753214407784,
|
| 421 |
+
"avg_mean_l2_distance": 40.05,
|
| 422 |
+
"std_mean_l2_distance": 20.54668464740723,
|
| 423 |
+
"avg_std_l2_distance": 15.74375,
|
| 424 |
+
"std_std_l2_distance": 5.980928892460323,
|
| 425 |
+
"avg_mean_dimension_correlation": 0.6713259566823642,
|
| 426 |
+
"std_mean_dimension_correlation": 0.14540630815784578,
|
| 427 |
+
"avg_std_dimension_correlation": 0.09921700445503585,
|
| 428 |
+
"std_std_dimension_correlation": 0.024890230217464338,
|
| 429 |
+
"avg_linear_cka": 0.85,
|
| 430 |
+
"std_linear_cka": 0.19227216969593736
|
| 431 |
+
}
|
| 432 |
+
}
|
| 433 |
+
}
|
evaluation/metrics_tokens_5002240.json
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 1,
|
| 3 |
+
"n_tokens": 5002240,
|
| 4 |
+
"global_step": 4885,
|
| 5 |
+
"training_metrics": {
|
| 6 |
+
"train/loss": 2.515625,
|
| 7 |
+
"train/contrastive": 2.421875,
|
| 8 |
+
"train/recons_loss": 0.65234375,
|
| 9 |
+
"train/balance_loss": 3.84375,
|
| 10 |
+
"train/balance_loss_contrastive": 2.796875,
|
| 11 |
+
"train/balance_loss_recons": 1.0390625,
|
| 12 |
+
"train/contrastive_std": 3.296875,
|
| 13 |
+
"train/recons_std": 0.1279296875,
|
| 14 |
+
"train/contrastive_min": 0.10791015625,
|
| 15 |
+
"train/contrastive_max": 7.0,
|
| 16 |
+
"train/recons_min": 0.546875,
|
| 17 |
+
"train/recons_max": 0.89453125,
|
| 18 |
+
"train/Qwen3_0.6B_layer_2": 0.89453125,
|
| 19 |
+
"train/Qwen3_0.6B_layer_4": 0.58984375,
|
| 20 |
+
"train/Qwen3_1.7B_layer_2": 0.578125,
|
| 21 |
+
"train/Qwen3_1.7B_layer_4": 0.69140625,
|
| 22 |
+
"train/Qwen3_4B_layer_2": 0.546875,
|
| 23 |
+
"train/Qwen3_4B_layer_4": 0.61328125,
|
| 24 |
+
"train/contrastives": null,
|
| 25 |
+
"train/epoch": 1,
|
| 26 |
+
"train/n_tokens": 5002240,
|
| 27 |
+
"train/step": 4885
|
| 28 |
+
},
|
| 29 |
+
"eval_metrics": {
|
| 30 |
+
"global_step": 4885,
|
| 31 |
+
"n_tokens": 5002240,
|
| 32 |
+
"kl_divergence": {
|
| 33 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 9.649429321289062,
|
| 34 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 8.296281814575195,
|
| 35 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 8.075584411621094,
|
| 36 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 8.38884162902832,
|
| 37 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.30383014678955,
|
| 38 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 8.307902336120605,
|
| 39 |
+
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
|
| 40 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 5.658719062805176,
|
| 41 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.32064151763916,
|
| 42 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.324888229370117,
|
| 43 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.3024439811706543,
|
| 44 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.259655714035034,
|
| 45 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.363274574279785,
|
| 46 |
+
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
|
| 47 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 12.87409782409668,
|
| 48 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 6.341550350189209,
|
| 49 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 5.935274600982666,
|
| 50 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.224505424499512,
|
| 51 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.194956302642822,
|
| 52 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.298384189605713,
|
| 53 |
+
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
|
| 54 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 5.595909595489502,
|
| 55 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.6820993423461914,
|
| 56 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.6489415168762207,
|
| 57 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.6376867294311523,
|
| 58 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.660701274871826,
|
| 59 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.7599291801452637,
|
| 60 |
+
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
|
| 61 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 7.069366455078125,
|
| 62 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.4930596351623535,
|
| 63 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.338548421859741,
|
| 64 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.4058313369750977,
|
| 65 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.42093825340271,
|
| 66 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.3004584312438965,
|
| 67 |
+
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
|
| 68 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 6.583094120025635,
|
| 69 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.2262253761291504,
|
| 70 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.1968085765838623,
|
| 71 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.333820343017578,
|
| 72 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.3650805950164795,
|
| 73 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.0115933418273926,
|
| 74 |
+
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
|
| 75 |
+
},
|
| 76 |
+
"mae_hidden_states": {
|
| 77 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 3.5094382762908936,
|
| 78 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.214106798171997,
|
| 79 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 2.274066925048828,
|
| 80 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.3673298358917236,
|
| 81 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.390550136566162,
|
| 82 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 2.258884906768799,
|
| 83 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 3.714834690093994,
|
| 84 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.0173228979110718,
|
| 85 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0794646739959717,
|
| 86 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.068953275680542,
|
| 87 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.0350369215011597,
|
| 88 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.035721778869629,
|
| 89 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 3.772433042526245,
|
| 90 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.0048426389694214,
|
| 91 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.02411949634552,
|
| 92 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.0449342727661133,
|
| 93 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.0301669836044312,
|
| 94 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.027207612991333,
|
| 95 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 2.8531670570373535,
|
| 96 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.316890001296997,
|
| 97 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.3466463088989258,
|
| 98 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.3155006170272827,
|
| 99 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.3215066194534302,
|
| 100 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.3193069696426392,
|
| 101 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 2.85866117477417,
|
| 102 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.078108787536621,
|
| 103 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.092585802078247,
|
| 104 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.0788657665252686,
|
| 105 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.037369728088379,
|
| 106 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.057699203491211,
|
| 107 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 3.228097438812256,
|
| 108 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.1583912372589111,
|
| 109 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.1884386539459229,
|
| 110 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.1869480609893799,
|
| 111 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.176148772239685,
|
| 112 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.147931694984436
|
| 113 |
+
},
|
| 114 |
+
"alignment": {
|
| 115 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 116 |
+
"mse": 0.9609375,
|
| 117 |
+
"mean_cosine_similarity": 0.5078125,
|
| 118 |
+
"std_cosine_similarity": 0.2080078125,
|
| 119 |
+
"mean_l2_distance": 49.25,
|
| 120 |
+
"std_l2_distance": 10.375,
|
| 121 |
+
"mean_dimension_correlation": 0.656490707397461,
|
| 122 |
+
"std_dimension_correlation": 0.0958554699318414,
|
| 123 |
+
"linear_cka": 0.78125
|
| 124 |
+
},
|
| 125 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 126 |
+
"mse": 0.9609375,
|
| 127 |
+
"mean_cosine_similarity": 0.50390625,
|
| 128 |
+
"std_cosine_similarity": 0.208984375,
|
| 129 |
+
"mean_l2_distance": 49.25,
|
| 130 |
+
"std_l2_distance": 10.4375,
|
| 131 |
+
"mean_dimension_correlation": 0.6543472290039063,
|
| 132 |
+
"std_dimension_correlation": 0.09762869101522263,
|
| 133 |
+
"linear_cka": 0.78125
|
| 134 |
+
},
|
| 135 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 136 |
+
"mse": 0.9609375,
|
| 137 |
+
"mean_cosine_similarity": 0.5078125,
|
| 138 |
+
"std_cosine_similarity": 0.2080078125,
|
| 139 |
+
"mean_l2_distance": 49.25,
|
| 140 |
+
"std_l2_distance": 10.375,
|
| 141 |
+
"mean_dimension_correlation": 0.6573211669921875,
|
| 142 |
+
"std_dimension_correlation": 0.09694915743193826,
|
| 143 |
+
"linear_cka": 0.76953125
|
| 144 |
+
},
|
| 145 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
|
| 146 |
+
"mse": 0.97265625,
|
| 147 |
+
"mean_cosine_similarity": 0.498046875,
|
| 148 |
+
"std_cosine_similarity": 0.20703125,
|
| 149 |
+
"mean_l2_distance": 49.75,
|
| 150 |
+
"std_l2_distance": 10.25,
|
| 151 |
+
"mean_dimension_correlation": 0.6492362976074219,
|
| 152 |
+
"std_dimension_correlation": 0.09888349567655164,
|
| 153 |
+
"linear_cka": 0.78125
|
| 154 |
+
},
|
| 155 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
|
| 156 |
+
"mse": 0.96484375,
|
| 157 |
+
"mean_cosine_similarity": 0.50390625,
|
| 158 |
+
"std_cosine_similarity": 0.2099609375,
|
| 159 |
+
"mean_l2_distance": 49.5,
|
| 160 |
+
"std_l2_distance": 10.4375,
|
| 161 |
+
"mean_dimension_correlation": 0.6535564422607422,
|
| 162 |
+
"std_dimension_correlation": 0.09991360994754159,
|
| 163 |
+
"linear_cka": 0.7734375
|
| 164 |
+
},
|
| 165 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 166 |
+
"mse": 0.9609375,
|
| 167 |
+
"mean_cosine_similarity": 0.5078125,
|
| 168 |
+
"std_cosine_similarity": 0.2080078125,
|
| 169 |
+
"mean_l2_distance": 49.25,
|
| 170 |
+
"std_l2_distance": 10.375,
|
| 171 |
+
"mean_dimension_correlation": 0.6565372467041015,
|
| 172 |
+
"std_dimension_correlation": 0.09589836585224641,
|
| 173 |
+
"linear_cka": 0.78125
|
| 174 |
+
},
|
| 175 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 176 |
+
"mse": 0.39453125,
|
| 177 |
+
"mean_cosine_similarity": 0.8671875,
|
| 178 |
+
"std_cosine_similarity": 0.212890625,
|
| 179 |
+
"mean_l2_distance": 20.25,
|
| 180 |
+
"std_l2_distance": 16.875,
|
| 181 |
+
"mean_dimension_correlation": 0.8475028991699218,
|
| 182 |
+
"std_dimension_correlation": 0.0568063510608605,
|
| 183 |
+
"linear_cka": 0.984375
|
| 184 |
+
},
|
| 185 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 186 |
+
"mse": 0.388671875,
|
| 187 |
+
"mean_cosine_similarity": 0.87109375,
|
| 188 |
+
"std_cosine_similarity": 0.2138671875,
|
| 189 |
+
"mean_l2_distance": 19.875,
|
| 190 |
+
"std_l2_distance": 17.0,
|
| 191 |
+
"mean_dimension_correlation": 0.8486709594726562,
|
| 192 |
+
"std_dimension_correlation": 0.05617761489018725,
|
| 193 |
+
"linear_cka": 0.984375
|
| 194 |
+
},
|
| 195 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
|
| 196 |
+
"mse": 0.404296875,
|
| 197 |
+
"mean_cosine_similarity": 0.859375,
|
| 198 |
+
"std_cosine_similarity": 0.23046875,
|
| 199 |
+
"mean_l2_distance": 20.75,
|
| 200 |
+
"std_l2_distance": 17.75,
|
| 201 |
+
"mean_dimension_correlation": 0.837774658203125,
|
| 202 |
+
"std_dimension_correlation": 0.06036525651201483,
|
| 203 |
+
"linear_cka": 0.984375
|
| 204 |
+
},
|
| 205 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
|
| 206 |
+
"mse": 0.3984375,
|
| 207 |
+
"mean_cosine_similarity": 0.859375,
|
| 208 |
+
"std_cosine_similarity": 0.2294921875,
|
| 209 |
+
"mean_l2_distance": 20.375,
|
| 210 |
+
"std_l2_distance": 17.75,
|
| 211 |
+
"mean_dimension_correlation": 0.8405960083007813,
|
| 212 |
+
"std_dimension_correlation": 0.06082946585887382,
|
| 213 |
+
"linear_cka": 0.98828125
|
| 214 |
+
},
|
| 215 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 216 |
+
"mse": 0.9609375,
|
| 217 |
+
"mean_cosine_similarity": 0.50390625,
|
| 218 |
+
"std_cosine_similarity": 0.208984375,
|
| 219 |
+
"mean_l2_distance": 49.25,
|
| 220 |
+
"std_l2_distance": 10.4375,
|
| 221 |
+
"mean_dimension_correlation": 0.6544017791748047,
|
| 222 |
+
"std_dimension_correlation": 0.09771714306924677,
|
| 223 |
+
"linear_cka": 0.78125
|
| 224 |
+
},
|
| 225 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 226 |
+
"mse": 0.39453125,
|
| 227 |
+
"mean_cosine_similarity": 0.8671875,
|
| 228 |
+
"std_cosine_similarity": 0.212890625,
|
| 229 |
+
"mean_l2_distance": 20.25,
|
| 230 |
+
"std_l2_distance": 16.875,
|
| 231 |
+
"mean_dimension_correlation": 0.8475006103515625,
|
| 232 |
+
"std_dimension_correlation": 0.05680066543049433,
|
| 233 |
+
"linear_cka": 0.984375
|
| 234 |
+
},
|
| 235 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 236 |
+
"mse": 0.38671875,
|
| 237 |
+
"mean_cosine_similarity": 0.86328125,
|
| 238 |
+
"std_cosine_similarity": 0.2255859375,
|
| 239 |
+
"mean_l2_distance": 19.875,
|
| 240 |
+
"std_l2_distance": 17.75,
|
| 241 |
+
"mean_dimension_correlation": 0.8454933166503906,
|
| 242 |
+
"std_dimension_correlation": 0.05831595958380969,
|
| 243 |
+
"linear_cka": 0.9921875
|
| 244 |
+
},
|
| 245 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
|
| 246 |
+
"mse": 0.396484375,
|
| 247 |
+
"mean_cosine_similarity": 0.859375,
|
| 248 |
+
"std_cosine_similarity": 0.2275390625,
|
| 249 |
+
"mean_l2_distance": 20.25,
|
| 250 |
+
"std_l2_distance": 17.75,
|
| 251 |
+
"mean_dimension_correlation": 0.840838623046875,
|
| 252 |
+
"std_dimension_correlation": 0.06208702710996684,
|
| 253 |
+
"linear_cka": 0.984375
|
| 254 |
+
},
|
| 255 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
|
| 256 |
+
"mse": 0.37890625,
|
| 257 |
+
"mean_cosine_similarity": 0.875,
|
| 258 |
+
"std_cosine_similarity": 0.208984375,
|
| 259 |
+
"mean_l2_distance": 19.375,
|
| 260 |
+
"std_l2_distance": 16.875,
|
| 261 |
+
"mean_dimension_correlation": 0.8549148559570312,
|
| 262 |
+
"std_dimension_correlation": 0.05559449933392313,
|
| 263 |
+
"linear_cka": 0.98828125
|
| 264 |
+
},
|
| 265 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 266 |
+
"mse": 0.9609375,
|
| 267 |
+
"mean_cosine_similarity": 0.5078125,
|
| 268 |
+
"std_cosine_similarity": 0.2080078125,
|
| 269 |
+
"mean_l2_distance": 49.25,
|
| 270 |
+
"std_l2_distance": 10.375,
|
| 271 |
+
"mean_dimension_correlation": 0.6572914123535156,
|
| 272 |
+
"std_dimension_correlation": 0.09700915659232981,
|
| 273 |
+
"linear_cka": 0.76953125
|
| 274 |
+
},
|
| 275 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 276 |
+
"mse": 0.388671875,
|
| 277 |
+
"mean_cosine_similarity": 0.87109375,
|
| 278 |
+
"std_cosine_similarity": 0.2138671875,
|
| 279 |
+
"mean_l2_distance": 19.875,
|
| 280 |
+
"std_l2_distance": 17.0,
|
| 281 |
+
"mean_dimension_correlation": 0.8486358642578125,
|
| 282 |
+
"std_dimension_correlation": 0.05619399135067562,
|
| 283 |
+
"linear_cka": 0.984375
|
| 284 |
+
},
|
| 285 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 286 |
+
"mse": 0.38671875,
|
| 287 |
+
"mean_cosine_similarity": 0.86328125,
|
| 288 |
+
"std_cosine_similarity": 0.2255859375,
|
| 289 |
+
"mean_l2_distance": 19.875,
|
| 290 |
+
"std_l2_distance": 17.75,
|
| 291 |
+
"mean_dimension_correlation": 0.8454788208007813,
|
| 292 |
+
"std_dimension_correlation": 0.05838818713320479,
|
| 293 |
+
"linear_cka": 0.9921875
|
| 294 |
+
},
|
| 295 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
|
| 296 |
+
"mse": 0.396484375,
|
| 297 |
+
"mean_cosine_similarity": 0.86328125,
|
| 298 |
+
"std_cosine_similarity": 0.2255859375,
|
| 299 |
+
"mean_l2_distance": 20.375,
|
| 300 |
+
"std_l2_distance": 17.625,
|
| 301 |
+
"mean_dimension_correlation": 0.8422294616699219,
|
| 302 |
+
"std_dimension_correlation": 0.058264678286021505,
|
| 303 |
+
"linear_cka": 0.984375
|
| 304 |
+
},
|
| 305 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
|
| 306 |
+
"mse": 0.37890625,
|
| 307 |
+
"mean_cosine_similarity": 0.87109375,
|
| 308 |
+
"std_cosine_similarity": 0.216796875,
|
| 309 |
+
"mean_l2_distance": 19.5,
|
| 310 |
+
"std_l2_distance": 17.25,
|
| 311 |
+
"mean_dimension_correlation": 0.8510147094726562,
|
| 312 |
+
"std_dimension_correlation": 0.05579116262302701,
|
| 313 |
+
"linear_cka": 0.98828125
|
| 314 |
+
},
|
| 315 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 316 |
+
"mse": 0.97265625,
|
| 317 |
+
"mean_cosine_similarity": 0.498046875,
|
| 318 |
+
"std_cosine_similarity": 0.20703125,
|
| 319 |
+
"mean_l2_distance": 49.75,
|
| 320 |
+
"std_l2_distance": 10.25,
|
| 321 |
+
"mean_dimension_correlation": 0.6493141174316406,
|
| 322 |
+
"std_dimension_correlation": 0.09894686111103042,
|
| 323 |
+
"linear_cka": 0.78125
|
| 324 |
+
},
|
| 325 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 326 |
+
"mse": 0.404296875,
|
| 327 |
+
"mean_cosine_similarity": 0.859375,
|
| 328 |
+
"std_cosine_similarity": 0.23046875,
|
| 329 |
+
"mean_l2_distance": 20.75,
|
| 330 |
+
"std_l2_distance": 17.75,
|
| 331 |
+
"mean_dimension_correlation": 0.837823486328125,
|
| 332 |
+
"std_dimension_correlation": 0.060340047867445096,
|
| 333 |
+
"linear_cka": 0.984375
|
| 334 |
+
},
|
| 335 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 336 |
+
"mse": 0.396484375,
|
| 337 |
+
"mean_cosine_similarity": 0.859375,
|
| 338 |
+
"std_cosine_similarity": 0.2275390625,
|
| 339 |
+
"mean_l2_distance": 20.25,
|
| 340 |
+
"std_l2_distance": 17.75,
|
| 341 |
+
"mean_dimension_correlation": 0.8408378601074219,
|
| 342 |
+
"std_dimension_correlation": 0.06205699551364581,
|
| 343 |
+
"linear_cka": 0.984375
|
| 344 |
+
},
|
| 345 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 346 |
+
"mse": 0.396484375,
|
| 347 |
+
"mean_cosine_similarity": 0.86328125,
|
| 348 |
+
"std_cosine_similarity": 0.2255859375,
|
| 349 |
+
"mean_l2_distance": 20.375,
|
| 350 |
+
"std_l2_distance": 17.625,
|
| 351 |
+
"mean_dimension_correlation": 0.8422599792480469,
|
| 352 |
+
"std_dimension_correlation": 0.05822862763556895,
|
| 353 |
+
"linear_cka": 0.984375
|
| 354 |
+
},
|
| 355 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
|
| 356 |
+
"mse": 0.384765625,
|
| 357 |
+
"mean_cosine_similarity": 0.86328125,
|
| 358 |
+
"std_cosine_similarity": 0.2314453125,
|
| 359 |
+
"mean_l2_distance": 19.75,
|
| 360 |
+
"std_l2_distance": 18.25,
|
| 361 |
+
"mean_dimension_correlation": 0.842401123046875,
|
| 362 |
+
"std_dimension_correlation": 0.06118176940514725,
|
| 363 |
+
"linear_cka": 0.98828125
|
| 364 |
+
},
|
| 365 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 366 |
+
"mse": 0.96484375,
|
| 367 |
+
"mean_cosine_similarity": 0.50390625,
|
| 368 |
+
"std_cosine_similarity": 0.2099609375,
|
| 369 |
+
"mean_l2_distance": 49.5,
|
| 370 |
+
"std_l2_distance": 10.4375,
|
| 371 |
+
"mean_dimension_correlation": 0.6536331176757812,
|
| 372 |
+
"std_dimension_correlation": 0.09992718456511775,
|
| 373 |
+
"linear_cka": 0.7734375
|
| 374 |
+
},
|
| 375 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 376 |
+
"mse": 0.3984375,
|
| 377 |
+
"mean_cosine_similarity": 0.859375,
|
| 378 |
+
"std_cosine_similarity": 0.2294921875,
|
| 379 |
+
"mean_l2_distance": 20.375,
|
| 380 |
+
"std_l2_distance": 17.75,
|
| 381 |
+
"mean_dimension_correlation": 0.8406883239746094,
|
| 382 |
+
"std_dimension_correlation": 0.06084754257089133,
|
| 383 |
+
"linear_cka": 0.98828125
|
| 384 |
+
},
|
| 385 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 386 |
+
"mse": 0.37890625,
|
| 387 |
+
"mean_cosine_similarity": 0.875,
|
| 388 |
+
"std_cosine_similarity": 0.208984375,
|
| 389 |
+
"mean_l2_distance": 19.375,
|
| 390 |
+
"std_l2_distance": 16.875,
|
| 391 |
+
"mean_dimension_correlation": 0.8549110412597656,
|
| 392 |
+
"std_dimension_correlation": 0.05561355528314661,
|
| 393 |
+
"linear_cka": 0.98828125
|
| 394 |
+
},
|
| 395 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 396 |
+
"mse": 0.37890625,
|
| 397 |
+
"mean_cosine_similarity": 0.87109375,
|
| 398 |
+
"std_cosine_similarity": 0.216796875,
|
| 399 |
+
"mean_l2_distance": 19.5,
|
| 400 |
+
"std_l2_distance": 17.25,
|
| 401 |
+
"mean_dimension_correlation": 0.851116943359375,
|
| 402 |
+
"std_dimension_correlation": 0.05580195396667784,
|
| 403 |
+
"linear_cka": 0.98828125
|
| 404 |
+
},
|
| 405 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
|
| 406 |
+
"mse": 0.384765625,
|
| 407 |
+
"mean_cosine_similarity": 0.86328125,
|
| 408 |
+
"std_cosine_similarity": 0.2314453125,
|
| 409 |
+
"mean_l2_distance": 19.75,
|
| 410 |
+
"std_l2_distance": 18.25,
|
| 411 |
+
"mean_dimension_correlation": 0.8424659729003906,
|
| 412 |
+
"std_dimension_correlation": 0.06116790445559799,
|
| 413 |
+
"linear_cka": 0.98828125
|
| 414 |
+
},
|
| 415 |
+
"avg_mse": 0.5819010416666667,
|
| 416 |
+
"std_mse": 0.27032309960351786,
|
| 417 |
+
"avg_mean_cosine_similarity": 0.744921875,
|
| 418 |
+
"std_mean_cosine_similarity": 0.17021541336381552,
|
| 419 |
+
"avg_std_cosine_similarity": 0.21764322916666667,
|
| 420 |
+
"std_std_cosine_similarity": 0.00918084175662793,
|
| 421 |
+
"avg_mean_l2_distance": 29.825,
|
| 422 |
+
"std_mean_l2_distance": 13.846163728628952,
|
| 423 |
+
"avg_std_l2_distance": 15.116666666666667,
|
| 424 |
+
"std_std_l2_distance": 3.3721335564034565,
|
| 425 |
+
"avg_mean_dimension_correlation": 0.781509501139323,
|
| 426 |
+
"std_mean_dimension_correlation": 0.09012204602156858,
|
| 427 |
+
"avg_std_dimension_correlation": 0.07165274636880824,
|
| 428 |
+
"std_std_dimension_correlation": 0.018653236358734372,
|
| 429 |
+
"avg_linear_cka": 0.9169270833333333,
|
| 430 |
+
"std_linear_cka": 0.09876420102706185
|
| 431 |
+
}
|
| 432 |
+
}
|
| 433 |
+
}
|
evaluation/metrics_tokens_6002688.json
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 1,
|
| 3 |
+
"n_tokens": 6002688,
|
| 4 |
+
"global_step": 5862,
|
| 5 |
+
"training_metrics": {
|
| 6 |
+
"train/loss": 2.546875,
|
| 7 |
+
"train/contrastive": 2.453125,
|
| 8 |
+
"train/recons_loss": 0.6015625,
|
| 9 |
+
"train/balance_loss": 3.875,
|
| 10 |
+
"train/balance_loss_contrastive": 2.84375,
|
| 11 |
+
"train/balance_loss_recons": 1.0234375,
|
| 12 |
+
"train/contrastive_std": 3.359375,
|
| 13 |
+
"train/recons_std": 0.1044921875,
|
| 14 |
+
"train/contrastive_min": 0.0927734375,
|
| 15 |
+
"train/contrastive_max": 7.125,
|
| 16 |
+
"train/recons_min": 0.50390625,
|
| 17 |
+
"train/recons_max": 0.7890625,
|
| 18 |
+
"train/Qwen3_0.6B_layer_2": 0.7890625,
|
| 19 |
+
"train/Qwen3_0.6B_layer_4": 0.546875,
|
| 20 |
+
"train/Qwen3_1.7B_layer_2": 0.5390625,
|
| 21 |
+
"train/Qwen3_1.7B_layer_4": 0.65234375,
|
| 22 |
+
"train/Qwen3_4B_layer_2": 0.50390625,
|
| 23 |
+
"train/Qwen3_4B_layer_4": 0.578125,
|
| 24 |
+
"train/contrastives": null,
|
| 25 |
+
"train/epoch": 1,
|
| 26 |
+
"train/n_tokens": 6002688,
|
| 27 |
+
"train/step": 5862
|
| 28 |
+
},
|
| 29 |
+
"eval_metrics": {
|
| 30 |
+
"global_step": 5862,
|
| 31 |
+
"n_tokens": 6002688,
|
| 32 |
+
"kl_divergence": {
|
| 33 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 8.11236572265625,
|
| 34 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 7.918520927429199,
|
| 35 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 8.107908248901367,
|
| 36 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 7.823939323425293,
|
| 37 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 7.930037498474121,
|
| 38 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 7.921452522277832,
|
| 39 |
+
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
|
| 40 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 3.8407585620880127,
|
| 41 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.2737579345703125,
|
| 42 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.3659729957580566,
|
| 43 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.320258617401123,
|
| 44 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.256859302520752,
|
| 45 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.30106782913208,
|
| 46 |
+
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
|
| 47 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.883870601654053,
|
| 48 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.919462203979492,
|
| 49 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.107792377471924,
|
| 50 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.342267036437988,
|
| 51 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.5884199142456055,
|
| 52 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.350939750671387,
|
| 53 |
+
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
|
| 54 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 3.649061918258667,
|
| 55 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.6456005573272705,
|
| 56 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.6709065437316895,
|
| 57 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.648075580596924,
|
| 58 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.622471332550049,
|
| 59 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.786158800125122,
|
| 60 |
+
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
|
| 61 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 4.197429656982422,
|
| 62 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.494438648223877,
|
| 63 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.0957608222961426,
|
| 64 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.3116352558135986,
|
| 65 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.3204939365386963,
|
| 66 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.3190274238586426,
|
| 67 |
+
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
|
| 68 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 4.347363471984863,
|
| 69 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.295062303543091,
|
| 70 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.980980634689331,
|
| 71 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.255995512008667,
|
| 72 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.1774487495422363,
|
| 73 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.962179183959961,
|
| 74 |
+
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
|
| 75 |
+
},
|
| 76 |
+
"mae_hidden_states": {
|
| 77 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.7902801036834717,
|
| 78 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.5620601177215576,
|
| 79 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.5911345481872559,
|
| 80 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.6545490026474,
|
| 81 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.5992227792739868,
|
| 82 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.5955195426940918,
|
| 83 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 1.649550199508667,
|
| 84 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9683128595352173,
|
| 85 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0230571031570435,
|
| 86 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.0230211019515991,
|
| 87 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.976743221282959,
|
| 88 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9846528768539429,
|
| 89 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 1.6651275157928467,
|
| 90 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.9568565487861633,
|
| 91 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.9617317318916321,
|
| 92 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.9807920455932617,
|
| 93 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.968661904335022,
|
| 94 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.9733489751815796,
|
| 95 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.803094506263733,
|
| 96 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.2474844455718994,
|
| 97 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.2767094373703003,
|
| 98 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.2572760581970215,
|
| 99 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.2457572221755981,
|
| 100 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.2488363981246948,
|
| 101 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 1.6203731298446655,
|
| 102 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.0437383651733398,
|
| 103 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.0250771045684814,
|
| 104 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.0189640522003174,
|
| 105 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.9739342927932739,
|
| 106 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.9979578852653503,
|
| 107 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.6929349899291992,
|
| 108 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.1436792612075806,
|
| 109 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.1596219539642334,
|
| 110 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.1562455892562866,
|
| 111 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.1323540210723877,
|
| 112 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.1177374124526978
|
| 113 |
+
},
|
| 114 |
+
"alignment": {
|
| 115 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 116 |
+
"mse": 0.55859375,
|
| 117 |
+
"mean_cosine_similarity": 0.80859375,
|
| 118 |
+
"std_cosine_similarity": 0.1884765625,
|
| 119 |
+
"mean_l2_distance": 28.625,
|
| 120 |
+
"std_l2_distance": 13.4375,
|
| 121 |
+
"mean_dimension_correlation": 0.8190200805664063,
|
| 122 |
+
"std_dimension_correlation": 0.05411914097024187,
|
| 123 |
+
"linear_cka": 0.8828125
|
| 124 |
+
},
|
| 125 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 126 |
+
"mse": 0.56640625,
|
| 127 |
+
"mean_cosine_similarity": 0.80078125,
|
| 128 |
+
"std_cosine_similarity": 0.1943359375,
|
| 129 |
+
"mean_l2_distance": 29.0,
|
| 130 |
+
"std_l2_distance": 13.6875,
|
| 131 |
+
"mean_dimension_correlation": 0.8142623901367188,
|
| 132 |
+
"std_dimension_correlation": 0.05641095638371379,
|
| 133 |
+
"linear_cka": 0.8828125
|
| 134 |
+
},
|
| 135 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 136 |
+
"mse": 0.55859375,
|
| 137 |
+
"mean_cosine_similarity": 0.80859375,
|
| 138 |
+
"std_cosine_similarity": 0.1884765625,
|
| 139 |
+
"mean_l2_distance": 28.75,
|
| 140 |
+
"std_l2_distance": 13.4375,
|
| 141 |
+
"mean_dimension_correlation": 0.8182327270507812,
|
| 142 |
+
"std_dimension_correlation": 0.05454624510823859,
|
| 143 |
+
"linear_cka": 0.875
|
| 144 |
+
},
|
| 145 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
|
| 146 |
+
"mse": 0.56640625,
|
| 147 |
+
"mean_cosine_similarity": 0.80078125,
|
| 148 |
+
"std_cosine_similarity": 0.1953125,
|
| 149 |
+
"mean_l2_distance": 29.0,
|
| 150 |
+
"std_l2_distance": 13.8125,
|
| 151 |
+
"mean_dimension_correlation": 0.8126785278320312,
|
| 152 |
+
"std_dimension_correlation": 0.0564578377173453,
|
| 153 |
+
"linear_cka": 0.8828125
|
| 154 |
+
},
|
| 155 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
|
| 156 |
+
"mse": 0.56640625,
|
| 157 |
+
"mean_cosine_similarity": 0.80078125,
|
| 158 |
+
"std_cosine_similarity": 0.1943359375,
|
| 159 |
+
"mean_l2_distance": 29.0,
|
| 160 |
+
"std_l2_distance": 13.75,
|
| 161 |
+
"mean_dimension_correlation": 0.8143714904785156,
|
| 162 |
+
"std_dimension_correlation": 0.05675514125293357,
|
| 163 |
+
"linear_cka": 0.8828125
|
| 164 |
+
},
|
| 165 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 166 |
+
"mse": 0.55859375,
|
| 167 |
+
"mean_cosine_similarity": 0.80859375,
|
| 168 |
+
"std_cosine_similarity": 0.1884765625,
|
| 169 |
+
"mean_l2_distance": 28.625,
|
| 170 |
+
"std_l2_distance": 13.4375,
|
| 171 |
+
"mean_dimension_correlation": 0.818988037109375,
|
| 172 |
+
"std_dimension_correlation": 0.05413218674674489,
|
| 173 |
+
"linear_cka": 0.8828125
|
| 174 |
+
},
|
| 175 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 176 |
+
"mse": 0.34375,
|
| 177 |
+
"mean_cosine_similarity": 0.8984375,
|
| 178 |
+
"std_cosine_similarity": 0.1865234375,
|
| 179 |
+
"mean_l2_distance": 17.625,
|
| 180 |
+
"std_l2_distance": 15.1875,
|
| 181 |
+
"mean_dimension_correlation": 0.8818618774414062,
|
| 182 |
+
"std_dimension_correlation": 0.04443958868205486,
|
| 183 |
+
"linear_cka": 0.9921875
|
| 184 |
+
},
|
| 185 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 186 |
+
"mse": 0.337890625,
|
| 187 |
+
"mean_cosine_similarity": 0.8984375,
|
| 188 |
+
"std_cosine_similarity": 0.185546875,
|
| 189 |
+
"mean_l2_distance": 17.375,
|
| 190 |
+
"std_l2_distance": 15.1875,
|
| 191 |
+
"mean_dimension_correlation": 0.8834381103515625,
|
| 192 |
+
"std_dimension_correlation": 0.04373309871215753,
|
| 193 |
+
"linear_cka": 0.984375
|
| 194 |
+
},
|
| 195 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
|
| 196 |
+
"mse": 0.349609375,
|
| 197 |
+
"mean_cosine_similarity": 0.890625,
|
| 198 |
+
"std_cosine_similarity": 0.2001953125,
|
| 199 |
+
"mean_l2_distance": 18.0,
|
| 200 |
+
"std_l2_distance": 15.9375,
|
| 201 |
+
"mean_dimension_correlation": 0.875439453125,
|
| 202 |
+
"std_dimension_correlation": 0.04714301734748471,
|
| 203 |
+
"linear_cka": 0.984375
|
| 204 |
+
},
|
| 205 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
|
| 206 |
+
"mse": 0.345703125,
|
| 207 |
+
"mean_cosine_similarity": 0.89453125,
|
| 208 |
+
"std_cosine_similarity": 0.19921875,
|
| 209 |
+
"mean_l2_distance": 17.75,
|
| 210 |
+
"std_l2_distance": 15.875,
|
| 211 |
+
"mean_dimension_correlation": 0.8766807556152344,
|
| 212 |
+
"std_dimension_correlation": 0.048109971697749936,
|
| 213 |
+
"linear_cka": 0.98828125
|
| 214 |
+
},
|
| 215 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 216 |
+
"mse": 0.56640625,
|
| 217 |
+
"mean_cosine_similarity": 0.80078125,
|
| 218 |
+
"std_cosine_similarity": 0.1943359375,
|
| 219 |
+
"mean_l2_distance": 29.0,
|
| 220 |
+
"std_l2_distance": 13.6875,
|
| 221 |
+
"mean_dimension_correlation": 0.814276123046875,
|
| 222 |
+
"std_dimension_correlation": 0.05643128448459502,
|
| 223 |
+
"linear_cka": 0.8828125
|
| 224 |
+
},
|
| 225 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 226 |
+
"mse": 0.34375,
|
| 227 |
+
"mean_cosine_similarity": 0.8984375,
|
| 228 |
+
"std_cosine_similarity": 0.1865234375,
|
| 229 |
+
"mean_l2_distance": 17.625,
|
| 230 |
+
"std_l2_distance": 15.1875,
|
| 231 |
+
"mean_dimension_correlation": 0.88189697265625,
|
| 232 |
+
"std_dimension_correlation": 0.044490526216888565,
|
| 233 |
+
"linear_cka": 0.9921875
|
| 234 |
+
},
|
| 235 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 236 |
+
"mse": 0.3359375,
|
| 237 |
+
"mean_cosine_similarity": 0.89453125,
|
| 238 |
+
"std_cosine_similarity": 0.197265625,
|
| 239 |
+
"mean_l2_distance": 17.25,
|
| 240 |
+
"std_l2_distance": 15.9375,
|
| 241 |
+
"mean_dimension_correlation": 0.880517578125,
|
| 242 |
+
"std_dimension_correlation": 0.0455623185686599,
|
| 243 |
+
"linear_cka": 0.9921875
|
| 244 |
+
},
|
| 245 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
|
| 246 |
+
"mse": 0.34375,
|
| 247 |
+
"mean_cosine_similarity": 0.89453125,
|
| 248 |
+
"std_cosine_similarity": 0.19921875,
|
| 249 |
+
"mean_l2_distance": 17.5,
|
| 250 |
+
"std_l2_distance": 16.0,
|
| 251 |
+
"mean_dimension_correlation": 0.8772689819335937,
|
| 252 |
+
"std_dimension_correlation": 0.04894074305932899,
|
| 253 |
+
"linear_cka": 0.9921875
|
| 254 |
+
},
|
| 255 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
|
| 256 |
+
"mse": 0.328125,
|
| 257 |
+
"mean_cosine_similarity": 0.90234375,
|
| 258 |
+
"std_cosine_similarity": 0.18359375,
|
| 259 |
+
"mean_l2_distance": 16.875,
|
| 260 |
+
"std_l2_distance": 15.1875,
|
| 261 |
+
"mean_dimension_correlation": 0.8873504638671875,
|
| 262 |
+
"std_dimension_correlation": 0.04405998009297244,
|
| 263 |
+
"linear_cka": 0.98828125
|
| 264 |
+
},
|
| 265 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 266 |
+
"mse": 0.55859375,
|
| 267 |
+
"mean_cosine_similarity": 0.80859375,
|
| 268 |
+
"std_cosine_similarity": 0.1884765625,
|
| 269 |
+
"mean_l2_distance": 28.75,
|
| 270 |
+
"std_l2_distance": 13.4375,
|
| 271 |
+
"mean_dimension_correlation": 0.8182823181152343,
|
| 272 |
+
"std_dimension_correlation": 0.054536269965662375,
|
| 273 |
+
"linear_cka": 0.875
|
| 274 |
+
},
|
| 275 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 276 |
+
"mse": 0.337890625,
|
| 277 |
+
"mean_cosine_similarity": 0.8984375,
|
| 278 |
+
"std_cosine_similarity": 0.185546875,
|
| 279 |
+
"mean_l2_distance": 17.375,
|
| 280 |
+
"std_l2_distance": 15.1875,
|
| 281 |
+
"mean_dimension_correlation": 0.8834640502929687,
|
| 282 |
+
"std_dimension_correlation": 0.043707021156640234,
|
| 283 |
+
"linear_cka": 0.984375
|
| 284 |
+
},
|
| 285 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 286 |
+
"mse": 0.3359375,
|
| 287 |
+
"mean_cosine_similarity": 0.89453125,
|
| 288 |
+
"std_cosine_similarity": 0.197265625,
|
| 289 |
+
"mean_l2_distance": 17.25,
|
| 290 |
+
"std_l2_distance": 15.9375,
|
| 291 |
+
"mean_dimension_correlation": 0.880511474609375,
|
| 292 |
+
"std_dimension_correlation": 0.04552510428270067,
|
| 293 |
+
"linear_cka": 0.9921875
|
| 294 |
+
},
|
| 295 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
|
| 296 |
+
"mse": 0.34375,
|
| 297 |
+
"mean_cosine_similarity": 0.89453125,
|
| 298 |
+
"std_cosine_similarity": 0.197265625,
|
| 299 |
+
"mean_l2_distance": 17.625,
|
| 300 |
+
"std_l2_distance": 15.875,
|
| 301 |
+
"mean_dimension_correlation": 0.878497314453125,
|
| 302 |
+
"std_dimension_correlation": 0.045715874336397684,
|
| 303 |
+
"linear_cka": 0.984375
|
| 304 |
+
},
|
| 305 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
|
| 306 |
+
"mse": 0.330078125,
|
| 307 |
+
"mean_cosine_similarity": 0.90234375,
|
| 308 |
+
"std_cosine_similarity": 0.189453125,
|
| 309 |
+
"mean_l2_distance": 17.0,
|
| 310 |
+
"std_l2_distance": 15.4375,
|
| 311 |
+
"mean_dimension_correlation": 0.8847824096679687,
|
| 312 |
+
"std_dimension_correlation": 0.0437299377718827,
|
| 313 |
+
"linear_cka": 0.98828125
|
| 314 |
+
},
|
| 315 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 316 |
+
"mse": 0.56640625,
|
| 317 |
+
"mean_cosine_similarity": 0.80078125,
|
| 318 |
+
"std_cosine_similarity": 0.1953125,
|
| 319 |
+
"mean_l2_distance": 29.0,
|
| 320 |
+
"std_l2_distance": 13.8125,
|
| 321 |
+
"mean_dimension_correlation": 0.8127197265625,
|
| 322 |
+
"std_dimension_correlation": 0.05649289036639174,
|
| 323 |
+
"linear_cka": 0.8828125
|
| 324 |
+
},
|
| 325 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 326 |
+
"mse": 0.349609375,
|
| 327 |
+
"mean_cosine_similarity": 0.890625,
|
| 328 |
+
"std_cosine_similarity": 0.2001953125,
|
| 329 |
+
"mean_l2_distance": 18.0,
|
| 330 |
+
"std_l2_distance": 15.9375,
|
| 331 |
+
"mean_dimension_correlation": 0.8753738403320312,
|
| 332 |
+
"std_dimension_correlation": 0.047172969623106395,
|
| 333 |
+
"linear_cka": 0.984375
|
| 334 |
+
},
|
| 335 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 336 |
+
"mse": 0.34375,
|
| 337 |
+
"mean_cosine_similarity": 0.89453125,
|
| 338 |
+
"std_cosine_similarity": 0.19921875,
|
| 339 |
+
"mean_l2_distance": 17.5,
|
| 340 |
+
"std_l2_distance": 16.0,
|
| 341 |
+
"mean_dimension_correlation": 0.8773391723632813,
|
| 342 |
+
"std_dimension_correlation": 0.048965809084662616,
|
| 343 |
+
"linear_cka": 0.9921875
|
| 344 |
+
},
|
| 345 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 346 |
+
"mse": 0.34375,
|
| 347 |
+
"mean_cosine_similarity": 0.89453125,
|
| 348 |
+
"std_cosine_similarity": 0.197265625,
|
| 349 |
+
"mean_l2_distance": 17.625,
|
| 350 |
+
"std_l2_distance": 15.875,
|
| 351 |
+
"mean_dimension_correlation": 0.8785049438476562,
|
| 352 |
+
"std_dimension_correlation": 0.04566486947653503,
|
| 353 |
+
"linear_cka": 0.984375
|
| 354 |
+
},
|
| 355 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
|
| 356 |
+
"mse": 0.330078125,
|
| 357 |
+
"mean_cosine_similarity": 0.89453125,
|
| 358 |
+
"std_cosine_similarity": 0.2021484375,
|
| 359 |
+
"mean_l2_distance": 16.875,
|
| 360 |
+
"std_l2_distance": 16.5,
|
| 361 |
+
"mean_dimension_correlation": 0.8791839599609375,
|
| 362 |
+
"std_dimension_correlation": 0.04851861504925165,
|
| 363 |
+
"linear_cka": 0.99609375
|
| 364 |
+
},
|
| 365 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 366 |
+
"mse": 0.56640625,
|
| 367 |
+
"mean_cosine_similarity": 0.80078125,
|
| 368 |
+
"std_cosine_similarity": 0.1943359375,
|
| 369 |
+
"mean_l2_distance": 29.0,
|
| 370 |
+
"std_l2_distance": 13.75,
|
| 371 |
+
"mean_dimension_correlation": 0.8144371032714843,
|
| 372 |
+
"std_dimension_correlation": 0.056772418466086064,
|
| 373 |
+
"linear_cka": 0.8828125
|
| 374 |
+
},
|
| 375 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 376 |
+
"mse": 0.345703125,
|
| 377 |
+
"mean_cosine_similarity": 0.89453125,
|
| 378 |
+
"std_cosine_similarity": 0.19921875,
|
| 379 |
+
"mean_l2_distance": 17.75,
|
| 380 |
+
"std_l2_distance": 15.875,
|
| 381 |
+
"mean_dimension_correlation": 0.8767280578613281,
|
| 382 |
+
"std_dimension_correlation": 0.04809051339837505,
|
| 383 |
+
"linear_cka": 0.98828125
|
| 384 |
+
},
|
| 385 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 386 |
+
"mse": 0.328125,
|
| 387 |
+
"mean_cosine_similarity": 0.90234375,
|
| 388 |
+
"std_cosine_similarity": 0.18359375,
|
| 389 |
+
"mean_l2_distance": 16.875,
|
| 390 |
+
"std_l2_distance": 15.1875,
|
| 391 |
+
"mean_dimension_correlation": 0.88740234375,
|
| 392 |
+
"std_dimension_correlation": 0.04400411105457499,
|
| 393 |
+
"linear_cka": 0.98828125
|
| 394 |
+
},
|
| 395 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 396 |
+
"mse": 0.330078125,
|
| 397 |
+
"mean_cosine_similarity": 0.90234375,
|
| 398 |
+
"std_cosine_similarity": 0.189453125,
|
| 399 |
+
"mean_l2_distance": 17.0,
|
| 400 |
+
"std_l2_distance": 15.4375,
|
| 401 |
+
"mean_dimension_correlation": 0.8847671508789062,
|
| 402 |
+
"std_dimension_correlation": 0.04373389353658331,
|
| 403 |
+
"linear_cka": 0.98828125
|
| 404 |
+
},
|
| 405 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
|
| 406 |
+
"mse": 0.330078125,
|
| 407 |
+
"mean_cosine_similarity": 0.89453125,
|
| 408 |
+
"std_cosine_similarity": 0.2021484375,
|
| 409 |
+
"mean_l2_distance": 16.875,
|
| 410 |
+
"std_l2_distance": 16.5,
|
| 411 |
+
"mean_dimension_correlation": 0.8792098999023438,
|
| 412 |
+
"std_dimension_correlation": 0.04842937345071404,
|
| 413 |
+
"linear_cka": 0.99609375
|
| 414 |
+
},
|
| 415 |
+
"avg_mse": 0.413671875,
|
| 416 |
+
"std_mse": 0.10597438593952982,
|
| 417 |
+
"avg_mean_cosine_similarity": 0.865625,
|
| 418 |
+
"std_mean_cosine_similarity": 0.04379647828783345,
|
| 419 |
+
"avg_std_cosine_similarity": 0.19342447916666666,
|
| 420 |
+
"std_std_cosine_similarity": 0.005743944010446035,
|
| 421 |
+
"avg_mean_l2_distance": 21.216666666666665,
|
| 422 |
+
"std_mean_l2_distance": 5.424187087071717,
|
| 423 |
+
"avg_std_l2_distance": 15.016666666666667,
|
| 424 |
+
"std_std_l2_distance": 1.0459412294940647,
|
| 425 |
+
"avg_mean_dimension_correlation": 0.858916244506836,
|
| 426 |
+
"std_mean_dimension_correlation": 0.03071649327798749,
|
| 427 |
+
"avg_std_dimension_correlation": 0.04921305693535582,
|
| 428 |
+
"std_std_dimension_correlation": 0.004870255293926663,
|
| 429 |
+
"avg_linear_cka": 0.953125,
|
| 430 |
+
"std_linear_cka": 0.05095123792248166
|
| 431 |
+
}
|
| 432 |
+
}
|
| 433 |
+
}
|
evaluation/metrics_tokens_7003136.json
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 1,
|
| 3 |
+
"n_tokens": 7003136,
|
| 4 |
+
"global_step": 6839,
|
| 5 |
+
"training_metrics": {
|
| 6 |
+
"train/loss": 2.546875,
|
| 7 |
+
"train/contrastive": 2.453125,
|
| 8 |
+
"train/recons_loss": 0.5859375,
|
| 9 |
+
"train/balance_loss": 3.875,
|
| 10 |
+
"train/balance_loss_contrastive": 2.859375,
|
| 11 |
+
"train/balance_loss_recons": 1.015625,
|
| 12 |
+
"train/contrastive_std": 3.390625,
|
| 13 |
+
"train/recons_std": 0.07177734375,
|
| 14 |
+
"train/contrastive_min": 0.0849609375,
|
| 15 |
+
"train/contrastive_max": 7.1875,
|
| 16 |
+
"train/recons_min": 0.49609375,
|
| 17 |
+
"train/recons_max": 0.68359375,
|
| 18 |
+
"train/Qwen3_0.6B_layer_2": 0.68359375,
|
| 19 |
+
"train/Qwen3_0.6B_layer_4": 0.55859375,
|
| 20 |
+
"train/Qwen3_1.7B_layer_2": 0.5390625,
|
| 21 |
+
"train/Qwen3_1.7B_layer_4": 0.65625,
|
| 22 |
+
"train/Qwen3_4B_layer_2": 0.49609375,
|
| 23 |
+
"train/Qwen3_4B_layer_4": 0.57421875,
|
| 24 |
+
"train/contrastives": null,
|
| 25 |
+
"train/epoch": 1,
|
| 26 |
+
"train/n_tokens": 7003136,
|
| 27 |
+
"train/step": 6839
|
| 28 |
+
},
|
| 29 |
+
"eval_metrics": {
|
| 30 |
+
"global_step": 6839,
|
| 31 |
+
"n_tokens": 7003136,
|
| 32 |
+
"kl_divergence": {
|
| 33 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 7.436762809753418,
|
| 34 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 7.0658464431762695,
|
| 35 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 7.481184482574463,
|
| 36 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 7.2030930519104,
|
| 37 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 7.240965366363525,
|
| 38 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 7.4969305992126465,
|
| 39 |
+
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
|
| 40 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.213552474975586,
|
| 41 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.2100319862365723,
|
| 42 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.3322105407714844,
|
| 43 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.236471652984619,
|
| 44 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.2212750911712646,
|
| 45 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.308291435241699,
|
| 46 |
+
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
|
| 47 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.611138343811035,
|
| 48 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.940827369689941,
|
| 49 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.0901408195495605,
|
| 50 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.319899559020996,
|
| 51 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.241495609283447,
|
| 52 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.30285120010376,
|
| 53 |
+
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
|
| 54 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.640443801879883,
|
| 55 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.575563907623291,
|
| 56 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.661689043045044,
|
| 57 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.5703558921813965,
|
| 58 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.555753707885742,
|
| 59 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.7401225566864014,
|
| 60 |
+
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
|
| 61 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.5014476776123047,
|
| 62 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.3176639080047607,
|
| 63 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.024665355682373,
|
| 64 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.2373054027557373,
|
| 65 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.237010955810547,
|
| 66 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.1157214641571045,
|
| 67 |
+
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
|
| 68 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 3.1809849739074707,
|
| 69 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.101025104522705,
|
| 70 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.89253568649292,
|
| 71 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.0308847427368164,
|
| 72 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.043917179107666,
|
| 73 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.7547640800476074,
|
| 74 |
+
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
|
| 75 |
+
},
|
| 76 |
+
"mae_hidden_states": {
|
| 77 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.3392760753631592,
|
| 78 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.2929567098617554,
|
| 79 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.3323848247528076,
|
| 80 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.3409219980239868,
|
| 81 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.3193416595458984,
|
| 82 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.3225743770599365,
|
| 83 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 0.9926112294197083,
|
| 84 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9511648416519165,
|
| 85 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 0.9907838106155396,
|
| 86 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.0008413791656494,
|
| 87 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.9629853367805481,
|
| 88 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9665719270706177,
|
| 89 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 0.9707884788513184,
|
| 90 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.9194974899291992,
|
| 91 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.9195350408554077,
|
| 92 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.9375813603401184,
|
| 93 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.9312270283699036,
|
| 94 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.9314996600151062,
|
| 95 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.2487989664077759,
|
| 96 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.2148901224136353,
|
| 97 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.2322641611099243,
|
| 98 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.2202345132827759,
|
| 99 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.212104320526123,
|
| 100 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.2153819799423218,
|
| 101 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 0.9843270182609558,
|
| 102 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 0.9733158349990845,
|
| 103 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 0.9586274027824402,
|
| 104 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 0.9659514427185059,
|
| 105 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.9286855459213257,
|
| 106 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.941694438457489,
|
| 107 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.0964877605438232,
|
| 108 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.0766807794570923,
|
| 109 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.0825374126434326,
|
| 110 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.0911656618118286,
|
| 111 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.0675519704818726,
|
| 112 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.0499017238616943
|
| 113 |
+
},
|
| 114 |
+
"alignment": {
|
| 115 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 116 |
+
"mse": 0.37890625,
|
| 117 |
+
"mean_cosine_similarity": 0.89453125,
|
| 118 |
+
"std_cosine_similarity": 0.1748046875,
|
| 119 |
+
"mean_l2_distance": 19.375,
|
| 120 |
+
"std_l2_distance": 13.5,
|
| 121 |
+
"mean_dimension_correlation": 0.88087158203125,
|
| 122 |
+
"std_dimension_correlation": 0.03891650382660663,
|
| 123 |
+
"linear_cka": 0.96875
|
| 124 |
+
},
|
| 125 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 126 |
+
"mse": 0.384765625,
|
| 127 |
+
"mean_cosine_similarity": 0.890625,
|
| 128 |
+
"std_cosine_similarity": 0.1806640625,
|
| 129 |
+
"mean_l2_distance": 19.625,
|
| 130 |
+
"std_l2_distance": 13.8125,
|
| 131 |
+
"mean_dimension_correlation": 0.8772735595703125,
|
| 132 |
+
"std_dimension_correlation": 0.040613200120000074,
|
| 133 |
+
"linear_cka": 0.96875
|
| 134 |
+
},
|
| 135 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 136 |
+
"mse": 0.380859375,
|
| 137 |
+
"mean_cosine_similarity": 0.89453125,
|
| 138 |
+
"std_cosine_similarity": 0.1728515625,
|
| 139 |
+
"mean_l2_distance": 19.5,
|
| 140 |
+
"std_l2_distance": 13.25,
|
| 141 |
+
"mean_dimension_correlation": 0.8814620971679688,
|
| 142 |
+
"std_dimension_correlation": 0.03743361738689456,
|
| 143 |
+
"linear_cka": 0.96875
|
| 144 |
+
},
|
| 145 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
|
| 146 |
+
"mse": 0.380859375,
|
| 147 |
+
"mean_cosine_similarity": 0.890625,
|
| 148 |
+
"std_cosine_similarity": 0.1787109375,
|
| 149 |
+
"mean_l2_distance": 19.625,
|
| 150 |
+
"std_l2_distance": 13.6875,
|
| 151 |
+
"mean_dimension_correlation": 0.8784408569335938,
|
| 152 |
+
"std_dimension_correlation": 0.03935897183860133,
|
| 153 |
+
"linear_cka": 0.96875
|
| 154 |
+
},
|
| 155 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
|
| 156 |
+
"mse": 0.37890625,
|
| 157 |
+
"mean_cosine_similarity": 0.89453125,
|
| 158 |
+
"std_cosine_similarity": 0.177734375,
|
| 159 |
+
"mean_l2_distance": 19.5,
|
| 160 |
+
"std_l2_distance": 13.5625,
|
| 161 |
+
"mean_dimension_correlation": 0.880218505859375,
|
| 162 |
+
"std_dimension_correlation": 0.03827009184402313,
|
| 163 |
+
"linear_cka": 0.96875
|
| 164 |
+
},
|
| 165 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 166 |
+
"mse": 0.37890625,
|
| 167 |
+
"mean_cosine_similarity": 0.89453125,
|
| 168 |
+
"std_cosine_similarity": 0.1748046875,
|
| 169 |
+
"mean_l2_distance": 19.375,
|
| 170 |
+
"std_l2_distance": 13.5,
|
| 171 |
+
"mean_dimension_correlation": 0.8809066772460937,
|
| 172 |
+
"std_dimension_correlation": 0.038947862226715424,
|
| 173 |
+
"linear_cka": 0.96875
|
| 174 |
+
},
|
| 175 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 176 |
+
"mse": 0.3125,
|
| 177 |
+
"mean_cosine_similarity": 0.9140625,
|
| 178 |
+
"std_cosine_similarity": 0.171875,
|
| 179 |
+
"mean_l2_distance": 16.0,
|
| 180 |
+
"std_l2_distance": 14.1875,
|
| 181 |
+
"mean_dimension_correlation": 0.900384521484375,
|
| 182 |
+
"std_dimension_correlation": 0.03780791654866101,
|
| 183 |
+
"linear_cka": 0.984375
|
| 184 |
+
},
|
| 185 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 186 |
+
"mse": 0.30859375,
|
| 187 |
+
"mean_cosine_similarity": 0.9140625,
|
| 188 |
+
"std_cosine_similarity": 0.171875,
|
| 189 |
+
"mean_l2_distance": 15.75,
|
| 190 |
+
"std_l2_distance": 14.1875,
|
| 191 |
+
"mean_dimension_correlation": 0.9015731811523438,
|
| 192 |
+
"std_dimension_correlation": 0.037591582433134804,
|
| 193 |
+
"linear_cka": 0.984375
|
| 194 |
+
},
|
| 195 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
|
| 196 |
+
"mse": 0.314453125,
|
| 197 |
+
"mean_cosine_similarity": 0.91015625,
|
| 198 |
+
"std_cosine_similarity": 0.18359375,
|
| 199 |
+
"mean_l2_distance": 16.125,
|
| 200 |
+
"std_l2_distance": 14.75,
|
| 201 |
+
"mean_dimension_correlation": 0.8960525512695312,
|
| 202 |
+
"std_dimension_correlation": 0.0391310064588162,
|
| 203 |
+
"linear_cka": 0.984375
|
| 204 |
+
},
|
| 205 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
|
| 206 |
+
"mse": 0.3125,
|
| 207 |
+
"mean_cosine_similarity": 0.91015625,
|
| 208 |
+
"std_cosine_similarity": 0.1826171875,
|
| 209 |
+
"mean_l2_distance": 15.9375,
|
| 210 |
+
"std_l2_distance": 14.75,
|
| 211 |
+
"mean_dimension_correlation": 0.8973068237304688,
|
| 212 |
+
"std_dimension_correlation": 0.040051008367224694,
|
| 213 |
+
"linear_cka": 0.984375
|
| 214 |
+
},
|
| 215 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 216 |
+
"mse": 0.384765625,
|
| 217 |
+
"mean_cosine_similarity": 0.890625,
|
| 218 |
+
"std_cosine_similarity": 0.1806640625,
|
| 219 |
+
"mean_l2_distance": 19.625,
|
| 220 |
+
"std_l2_distance": 13.8125,
|
| 221 |
+
"mean_dimension_correlation": 0.877227783203125,
|
| 222 |
+
"std_dimension_correlation": 0.04062615493819983,
|
| 223 |
+
"linear_cka": 0.96875
|
| 224 |
+
},
|
| 225 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 226 |
+
"mse": 0.3125,
|
| 227 |
+
"mean_cosine_similarity": 0.9140625,
|
| 228 |
+
"std_cosine_similarity": 0.171875,
|
| 229 |
+
"mean_l2_distance": 16.0,
|
| 230 |
+
"std_l2_distance": 14.1875,
|
| 231 |
+
"mean_dimension_correlation": 0.9004119873046875,
|
| 232 |
+
"std_dimension_correlation": 0.037833126797664623,
|
| 233 |
+
"linear_cka": 0.984375
|
| 234 |
+
},
|
| 235 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 236 |
+
"mse": 0.302734375,
|
| 237 |
+
"mean_cosine_similarity": 0.9140625,
|
| 238 |
+
"std_cosine_similarity": 0.181640625,
|
| 239 |
+
"mean_l2_distance": 15.5625,
|
| 240 |
+
"std_l2_distance": 14.875,
|
| 241 |
+
"mean_dimension_correlation": 0.8998611450195313,
|
| 242 |
+
"std_dimension_correlation": 0.03902960080299192,
|
| 243 |
+
"linear_cka": 0.984375
|
| 244 |
+
},
|
| 245 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
|
| 246 |
+
"mse": 0.306640625,
|
| 247 |
+
"mean_cosine_similarity": 0.91015625,
|
| 248 |
+
"std_cosine_similarity": 0.1826171875,
|
| 249 |
+
"mean_l2_distance": 15.6875,
|
| 250 |
+
"std_l2_distance": 14.9375,
|
| 251 |
+
"mean_dimension_correlation": 0.8979522705078125,
|
| 252 |
+
"std_dimension_correlation": 0.04081550083822763,
|
| 253 |
+
"linear_cka": 0.984375
|
| 254 |
+
},
|
| 255 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
|
| 256 |
+
"mse": 0.296875,
|
| 257 |
+
"mean_cosine_similarity": 0.91796875,
|
| 258 |
+
"std_cosine_similarity": 0.1689453125,
|
| 259 |
+
"mean_l2_distance": 15.1875,
|
| 260 |
+
"std_l2_distance": 14.1875,
|
| 261 |
+
"mean_dimension_correlation": 0.9056289672851563,
|
| 262 |
+
"std_dimension_correlation": 0.037629372700621964,
|
| 263 |
+
"linear_cka": 0.984375
|
| 264 |
+
},
|
| 265 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 266 |
+
"mse": 0.380859375,
|
| 267 |
+
"mean_cosine_similarity": 0.89453125,
|
| 268 |
+
"std_cosine_similarity": 0.1728515625,
|
| 269 |
+
"mean_l2_distance": 19.5,
|
| 270 |
+
"std_l2_distance": 13.25,
|
| 271 |
+
"mean_dimension_correlation": 0.881549072265625,
|
| 272 |
+
"std_dimension_correlation": 0.037422879211554134,
|
| 273 |
+
"linear_cka": 0.96875
|
| 274 |
+
},
|
| 275 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 276 |
+
"mse": 0.30859375,
|
| 277 |
+
"mean_cosine_similarity": 0.9140625,
|
| 278 |
+
"std_cosine_similarity": 0.171875,
|
| 279 |
+
"mean_l2_distance": 15.75,
|
| 280 |
+
"std_l2_distance": 14.1875,
|
| 281 |
+
"mean_dimension_correlation": 0.9015762329101562,
|
| 282 |
+
"std_dimension_correlation": 0.03752165553415462,
|
| 283 |
+
"linear_cka": 0.984375
|
| 284 |
+
},
|
| 285 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 286 |
+
"mse": 0.302734375,
|
| 287 |
+
"mean_cosine_similarity": 0.9140625,
|
| 288 |
+
"std_cosine_similarity": 0.181640625,
|
| 289 |
+
"mean_l2_distance": 15.5625,
|
| 290 |
+
"std_l2_distance": 14.875,
|
| 291 |
+
"mean_dimension_correlation": 0.8998123168945312,
|
| 292 |
+
"std_dimension_correlation": 0.03903639037032718,
|
| 293 |
+
"linear_cka": 0.984375
|
| 294 |
+
},
|
| 295 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
|
| 296 |
+
"mse": 0.30859375,
|
| 297 |
+
"mean_cosine_similarity": 0.9140625,
|
| 298 |
+
"std_cosine_similarity": 0.1806640625,
|
| 299 |
+
"mean_l2_distance": 15.8125,
|
| 300 |
+
"std_l2_distance": 14.75,
|
| 301 |
+
"mean_dimension_correlation": 0.8988906860351562,
|
| 302 |
+
"std_dimension_correlation": 0.038209415172214704,
|
| 303 |
+
"linear_cka": 0.9921875
|
| 304 |
+
},
|
| 305 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
|
| 306 |
+
"mse": 0.298828125,
|
| 307 |
+
"mean_cosine_similarity": 0.91796875,
|
| 308 |
+
"std_cosine_similarity": 0.1748046875,
|
| 309 |
+
"mean_l2_distance": 15.3125,
|
| 310 |
+
"std_l2_distance": 14.4375,
|
| 311 |
+
"mean_dimension_correlation": 0.9034744262695312,
|
| 312 |
+
"std_dimension_correlation": 0.03725401269453125,
|
| 313 |
+
"linear_cka": 0.9921875
|
| 314 |
+
},
|
| 315 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 316 |
+
"mse": 0.380859375,
|
| 317 |
+
"mean_cosine_similarity": 0.890625,
|
| 318 |
+
"std_cosine_similarity": 0.1787109375,
|
| 319 |
+
"mean_l2_distance": 19.625,
|
| 320 |
+
"std_l2_distance": 13.6875,
|
| 321 |
+
"mean_dimension_correlation": 0.8785064697265625,
|
| 322 |
+
"std_dimension_correlation": 0.0393290152752092,
|
| 323 |
+
"linear_cka": 0.96875
|
| 324 |
+
},
|
| 325 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 326 |
+
"mse": 0.314453125,
|
| 327 |
+
"mean_cosine_similarity": 0.91015625,
|
| 328 |
+
"std_cosine_similarity": 0.18359375,
|
| 329 |
+
"mean_l2_distance": 16.125,
|
| 330 |
+
"std_l2_distance": 14.75,
|
| 331 |
+
"mean_dimension_correlation": 0.8960662841796875,
|
| 332 |
+
"std_dimension_correlation": 0.039119882279186446,
|
| 333 |
+
"linear_cka": 0.984375
|
| 334 |
+
},
|
| 335 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 336 |
+
"mse": 0.306640625,
|
| 337 |
+
"mean_cosine_similarity": 0.91015625,
|
| 338 |
+
"std_cosine_similarity": 0.1826171875,
|
| 339 |
+
"mean_l2_distance": 15.6875,
|
| 340 |
+
"std_l2_distance": 14.9375,
|
| 341 |
+
"mean_dimension_correlation": 0.8979080200195313,
|
| 342 |
+
"std_dimension_correlation": 0.04092076296017269,
|
| 343 |
+
"linear_cka": 0.984375
|
| 344 |
+
},
|
| 345 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 346 |
+
"mse": 0.30859375,
|
| 347 |
+
"mean_cosine_similarity": 0.9140625,
|
| 348 |
+
"std_cosine_similarity": 0.1806640625,
|
| 349 |
+
"mean_l2_distance": 15.8125,
|
| 350 |
+
"std_l2_distance": 14.75,
|
| 351 |
+
"mean_dimension_correlation": 0.8988739013671875,
|
| 352 |
+
"std_dimension_correlation": 0.03830457081641783,
|
| 353 |
+
"linear_cka": 0.9921875
|
| 354 |
+
},
|
| 355 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
|
| 356 |
+
"mse": 0.29296875,
|
| 357 |
+
"mean_cosine_similarity": 0.9140625,
|
| 358 |
+
"std_cosine_similarity": 0.1845703125,
|
| 359 |
+
"mean_l2_distance": 15.0,
|
| 360 |
+
"std_l2_distance": 15.1875,
|
| 361 |
+
"mean_dimension_correlation": 0.9007888793945312,
|
| 362 |
+
"std_dimension_correlation": 0.03971286220373166,
|
| 363 |
+
"linear_cka": 0.9921875
|
| 364 |
+
},
|
| 365 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 366 |
+
"mse": 0.37890625,
|
| 367 |
+
"mean_cosine_similarity": 0.89453125,
|
| 368 |
+
"std_cosine_similarity": 0.177734375,
|
| 369 |
+
"mean_l2_distance": 19.5,
|
| 370 |
+
"std_l2_distance": 13.5625,
|
| 371 |
+
"mean_dimension_correlation": 0.8801483154296875,
|
| 372 |
+
"std_dimension_correlation": 0.038289717180632434,
|
| 373 |
+
"linear_cka": 0.96875
|
| 374 |
+
},
|
| 375 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 376 |
+
"mse": 0.3125,
|
| 377 |
+
"mean_cosine_similarity": 0.91015625,
|
| 378 |
+
"std_cosine_similarity": 0.1826171875,
|
| 379 |
+
"mean_l2_distance": 15.9375,
|
| 380 |
+
"std_l2_distance": 14.75,
|
| 381 |
+
"mean_dimension_correlation": 0.8973480224609375,
|
| 382 |
+
"std_dimension_correlation": 0.0400727561743654,
|
| 383 |
+
"linear_cka": 0.984375
|
| 384 |
+
},
|
| 385 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 386 |
+
"mse": 0.296875,
|
| 387 |
+
"mean_cosine_similarity": 0.91796875,
|
| 388 |
+
"std_cosine_similarity": 0.1689453125,
|
| 389 |
+
"mean_l2_distance": 15.1875,
|
| 390 |
+
"std_l2_distance": 14.1875,
|
| 391 |
+
"mean_dimension_correlation": 0.9055801391601562,
|
| 392 |
+
"std_dimension_correlation": 0.0376429470480909,
|
| 393 |
+
"linear_cka": 0.984375
|
| 394 |
+
},
|
| 395 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 396 |
+
"mse": 0.298828125,
|
| 397 |
+
"mean_cosine_similarity": 0.91796875,
|
| 398 |
+
"std_cosine_similarity": 0.1748046875,
|
| 399 |
+
"mean_l2_distance": 15.3125,
|
| 400 |
+
"std_l2_distance": 14.4375,
|
| 401 |
+
"mean_dimension_correlation": 0.9034759521484375,
|
| 402 |
+
"std_dimension_correlation": 0.03727643893244147,
|
| 403 |
+
"linear_cka": 0.9921875
|
| 404 |
+
},
|
| 405 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
|
| 406 |
+
"mse": 0.29296875,
|
| 407 |
+
"mean_cosine_similarity": 0.9140625,
|
| 408 |
+
"std_cosine_similarity": 0.1845703125,
|
| 409 |
+
"mean_l2_distance": 15.0,
|
| 410 |
+
"std_l2_distance": 15.1875,
|
| 411 |
+
"mean_dimension_correlation": 0.9008010864257813,
|
| 412 |
+
"std_dimension_correlation": 0.03971378852358426,
|
| 413 |
+
"linear_cka": 0.9921875
|
| 414 |
+
},
|
| 415 |
+
"avg_mse": 0.33059895833333336,
|
| 416 |
+
"std_mse": 0.0360100791855751,
|
| 417 |
+
"avg_mean_cosine_similarity": 0.9067708333333333,
|
| 418 |
+
"std_mean_cosine_similarity": 0.010072437294694645,
|
| 419 |
+
"avg_std_cosine_similarity": 0.17786458333333333,
|
| 420 |
+
"std_std_cosine_similarity": 0.0048221052570680215,
|
| 421 |
+
"avg_mean_l2_distance": 16.933333333333334,
|
| 422 |
+
"std_mean_l2_distance": 1.8555191696365978,
|
| 423 |
+
"avg_std_l2_distance": 14.270833333333334,
|
| 424 |
+
"std_std_l2_distance": 0.5816941254263752,
|
| 425 |
+
"avg_mean_dimension_correlation": 0.8933457438151041,
|
| 426 |
+
"std_mean_dimension_correlation": 0.009972265018242275,
|
| 427 |
+
"avg_std_dimension_correlation": 0.0387960870501666,
|
| 428 |
+
"std_std_dimension_correlation": 0.0011301372196425136,
|
| 429 |
+
"avg_linear_cka": 0.9807291666666667,
|
| 430 |
+
"std_linear_cka": 0.008960755486502733
|
| 431 |
+
}
|
| 432 |
+
}
|
| 433 |
+
}
|
evaluation/metrics_tokens_8003584.json
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 1,
|
| 3 |
+
"n_tokens": 8003584,
|
| 4 |
+
"global_step": 7816,
|
| 5 |
+
"training_metrics": {
|
| 6 |
+
"train/loss": 2.546875,
|
| 7 |
+
"train/contrastive": 2.453125,
|
| 8 |
+
"train/recons_loss": 0.5703125,
|
| 9 |
+
"train/balance_loss": 3.84375,
|
| 10 |
+
"train/balance_loss_contrastive": 2.84375,
|
| 11 |
+
"train/balance_loss_recons": 1.0078125,
|
| 12 |
+
"train/contrastive_std": 3.359375,
|
| 13 |
+
"train/recons_std": 0.0703125,
|
| 14 |
+
"train/contrastive_min": 0.083984375,
|
| 15 |
+
"train/contrastive_max": 7.125,
|
| 16 |
+
"train/recons_min": 0.48828125,
|
| 17 |
+
"train/recons_max": 0.671875,
|
| 18 |
+
"train/Qwen3_0.6B_layer_2": 0.671875,
|
| 19 |
+
"train/Qwen3_0.6B_layer_4": 0.54296875,
|
| 20 |
+
"train/Qwen3_1.7B_layer_2": 0.52734375,
|
| 21 |
+
"train/Qwen3_1.7B_layer_4": 0.640625,
|
| 22 |
+
"train/Qwen3_4B_layer_2": 0.48828125,
|
| 23 |
+
"train/Qwen3_4B_layer_4": 0.5625,
|
| 24 |
+
"train/contrastives": null,
|
| 25 |
+
"train/epoch": 1,
|
| 26 |
+
"train/n_tokens": 8003584,
|
| 27 |
+
"train/step": 7816
|
| 28 |
+
},
|
| 29 |
+
"eval_metrics": {
|
| 30 |
+
"global_step": 7816,
|
| 31 |
+
"n_tokens": 8003584,
|
| 32 |
+
"kl_divergence": {
|
| 33 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 6.801623344421387,
|
| 34 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 6.516300201416016,
|
| 35 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 6.550345420837402,
|
| 36 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.498440742492676,
|
| 37 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.312735080718994,
|
| 38 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.4551262855529785,
|
| 39 |
+
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
|
| 40 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.2260851860046387,
|
| 41 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.1856892108917236,
|
| 42 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.254146099090576,
|
| 43 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.229769468307495,
|
| 44 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.2037243843078613,
|
| 45 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.2896828651428223,
|
| 46 |
+
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
|
| 47 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.400465965270996,
|
| 48 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.9340386390686035,
|
| 49 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 5.794930458068848,
|
| 50 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 5.900982856750488,
|
| 51 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.348906517028809,
|
| 52 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.4423675537109375,
|
| 53 |
+
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
|
| 54 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.5666661262512207,
|
| 55 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.535998821258545,
|
| 56 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.4926912784576416,
|
| 57 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.476747989654541,
|
| 58 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.408336877822876,
|
| 59 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.6492466926574707,
|
| 60 |
+
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
|
| 61 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.4851021766662598,
|
| 62 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.303314685821533,
|
| 63 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.0016140937805176,
|
| 64 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.184553384780884,
|
| 65 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.121729850769043,
|
| 66 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.000966787338257,
|
| 67 |
+
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
|
| 68 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 3.442514419555664,
|
| 69 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.1136765480041504,
|
| 70 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.937788486480713,
|
| 71 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.0111327171325684,
|
| 72 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.0196948051452637,
|
| 73 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.7799510955810547,
|
| 74 |
+
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
|
| 75 |
+
},
|
| 76 |
+
"mae_hidden_states": {
|
| 77 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.2630091905593872,
|
| 78 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.2069993019104004,
|
| 79 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.2386506795883179,
|
| 80 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.2585456371307373,
|
| 81 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.212580919265747,
|
| 82 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.2229262590408325,
|
| 83 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 1.0233924388885498,
|
| 84 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9251772165298462,
|
| 85 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 0.9622151255607605,
|
| 86 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 0.9760592579841614,
|
| 87 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.9428697824478149,
|
| 88 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9486178159713745,
|
| 89 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 1.0079174041748047,
|
| 90 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.9031265377998352,
|
| 91 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.9057611227035522,
|
| 92 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.9231780767440796,
|
| 93 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.9179145097732544,
|
| 94 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.9312993884086609,
|
| 95 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.2595539093017578,
|
| 96 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.169715166091919,
|
| 97 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.1957802772521973,
|
| 98 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.1877433061599731,
|
| 99 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.179739236831665,
|
| 100 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.1788952350616455,
|
| 101 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 1.0426846742630005,
|
| 102 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 0.9591526985168457,
|
| 103 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 0.9619539380073547,
|
| 104 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 0.9698508977890015,
|
| 105 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.9279893636703491,
|
| 106 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.9385145902633667,
|
| 107 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.1462980508804321,
|
| 108 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.0632051229476929,
|
| 109 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.0799243450164795,
|
| 110 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.0858067274093628,
|
| 111 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.0611412525177002,
|
| 112 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.0519263744354248
|
| 113 |
+
},
|
| 114 |
+
"alignment": {
|
| 115 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 116 |
+
"mse": 0.388671875,
|
| 117 |
+
"mean_cosine_similarity": 0.89453125,
|
| 118 |
+
"std_cosine_similarity": 0.15625,
|
| 119 |
+
"mean_l2_distance": 19.875,
|
| 120 |
+
"std_l2_distance": 12.375,
|
| 121 |
+
"mean_dimension_correlation": 0.890447998046875,
|
| 122 |
+
"std_dimension_correlation": 0.03419125987740356,
|
| 123 |
+
"linear_cka": 0.96484375
|
| 124 |
+
},
|
| 125 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 126 |
+
"mse": 0.39453125,
|
| 127 |
+
"mean_cosine_similarity": 0.89453125,
|
| 128 |
+
"std_cosine_similarity": 0.162109375,
|
| 129 |
+
"mean_l2_distance": 20.125,
|
| 130 |
+
"std_l2_distance": 12.625,
|
| 131 |
+
"mean_dimension_correlation": 0.8867477416992188,
|
| 132 |
+
"std_dimension_correlation": 0.035491939390515204,
|
| 133 |
+
"linear_cka": 0.96484375
|
| 134 |
+
},
|
| 135 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 136 |
+
"mse": 0.39453125,
|
| 137 |
+
"mean_cosine_similarity": 0.89453125,
|
| 138 |
+
"std_cosine_similarity": 0.154296875,
|
| 139 |
+
"mean_l2_distance": 20.125,
|
| 140 |
+
"std_l2_distance": 12.1875,
|
| 141 |
+
"mean_dimension_correlation": 0.889697265625,
|
| 142 |
+
"std_dimension_correlation": 0.03374281347550432,
|
| 143 |
+
"linear_cka": 0.96484375
|
| 144 |
+
},
|
| 145 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
|
| 146 |
+
"mse": 0.390625,
|
| 147 |
+
"mean_cosine_similarity": 0.89453125,
|
| 148 |
+
"std_cosine_similarity": 0.1591796875,
|
| 149 |
+
"mean_l2_distance": 20.0,
|
| 150 |
+
"std_l2_distance": 12.5,
|
| 151 |
+
"mean_dimension_correlation": 0.8883514404296875,
|
| 152 |
+
"std_dimension_correlation": 0.035164283126066044,
|
| 153 |
+
"linear_cka": 0.96484375
|
| 154 |
+
},
|
| 155 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
|
| 156 |
+
"mse": 0.388671875,
|
| 157 |
+
"mean_cosine_similarity": 0.89453125,
|
| 158 |
+
"std_cosine_similarity": 0.1591796875,
|
| 159 |
+
"mean_l2_distance": 20.0,
|
| 160 |
+
"std_l2_distance": 12.4375,
|
| 161 |
+
"mean_dimension_correlation": 0.8896194458007812,
|
| 162 |
+
"std_dimension_correlation": 0.03421083254072828,
|
| 163 |
+
"linear_cka": 0.96484375
|
| 164 |
+
},
|
| 165 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 166 |
+
"mse": 0.388671875,
|
| 167 |
+
"mean_cosine_similarity": 0.89453125,
|
| 168 |
+
"std_cosine_similarity": 0.15625,
|
| 169 |
+
"mean_l2_distance": 19.875,
|
| 170 |
+
"std_l2_distance": 12.375,
|
| 171 |
+
"mean_dimension_correlation": 0.8904556274414063,
|
| 172 |
+
"std_dimension_correlation": 0.034210556225841876,
|
| 173 |
+
"linear_cka": 0.96484375
|
| 174 |
+
},
|
| 175 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 176 |
+
"mse": 0.26953125,
|
| 177 |
+
"mean_cosine_similarity": 0.93359375,
|
| 178 |
+
"std_cosine_similarity": 0.1513671875,
|
| 179 |
+
"mean_l2_distance": 13.8125,
|
| 180 |
+
"std_l2_distance": 12.625,
|
| 181 |
+
"mean_dimension_correlation": 0.923016357421875,
|
| 182 |
+
"std_dimension_correlation": 0.029236331051580345,
|
| 183 |
+
"linear_cka": 0.984375
|
| 184 |
+
},
|
| 185 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 186 |
+
"mse": 0.263671875,
|
| 187 |
+
"mean_cosine_similarity": 0.9375,
|
| 188 |
+
"std_cosine_similarity": 0.150390625,
|
| 189 |
+
"mean_l2_distance": 13.5625,
|
| 190 |
+
"std_l2_distance": 12.625,
|
| 191 |
+
"mean_dimension_correlation": 0.9244888305664063,
|
| 192 |
+
"std_dimension_correlation": 0.02919239611161659,
|
| 193 |
+
"linear_cka": 0.984375
|
| 194 |
+
},
|
| 195 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
|
| 196 |
+
"mse": 0.271484375,
|
| 197 |
+
"mean_cosine_similarity": 0.93359375,
|
| 198 |
+
"std_cosine_similarity": 0.16015625,
|
| 199 |
+
"mean_l2_distance": 13.875,
|
| 200 |
+
"std_l2_distance": 13.0625,
|
| 201 |
+
"mean_dimension_correlation": 0.9205032348632812,
|
| 202 |
+
"std_dimension_correlation": 0.029844860484086543,
|
| 203 |
+
"linear_cka": 0.984375
|
| 204 |
+
},
|
| 205 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
|
| 206 |
+
"mse": 0.267578125,
|
| 207 |
+
"mean_cosine_similarity": 0.93359375,
|
| 208 |
+
"std_cosine_similarity": 0.158203125,
|
| 209 |
+
"mean_l2_distance": 13.6875,
|
| 210 |
+
"std_l2_distance": 13.0,
|
| 211 |
+
"mean_dimension_correlation": 0.9218185424804688,
|
| 212 |
+
"std_dimension_correlation": 0.030954341854338954,
|
| 213 |
+
"linear_cka": 0.984375
|
| 214 |
+
},
|
| 215 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 216 |
+
"mse": 0.39453125,
|
| 217 |
+
"mean_cosine_similarity": 0.89453125,
|
| 218 |
+
"std_cosine_similarity": 0.162109375,
|
| 219 |
+
"mean_l2_distance": 20.125,
|
| 220 |
+
"std_l2_distance": 12.625,
|
| 221 |
+
"mean_dimension_correlation": 0.8868682861328125,
|
| 222 |
+
"std_dimension_correlation": 0.03559183889902671,
|
| 223 |
+
"linear_cka": 0.96484375
|
| 224 |
+
},
|
| 225 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 226 |
+
"mse": 0.26953125,
|
| 227 |
+
"mean_cosine_similarity": 0.93359375,
|
| 228 |
+
"std_cosine_similarity": 0.1513671875,
|
| 229 |
+
"mean_l2_distance": 13.8125,
|
| 230 |
+
"std_l2_distance": 12.625,
|
| 231 |
+
"mean_dimension_correlation": 0.9229568481445313,
|
| 232 |
+
"std_dimension_correlation": 0.029229316660619842,
|
| 233 |
+
"linear_cka": 0.984375
|
| 234 |
+
},
|
| 235 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 236 |
+
"mse": 0.2578125,
|
| 237 |
+
"mean_cosine_similarity": 0.93359375,
|
| 238 |
+
"std_cosine_similarity": 0.16015625,
|
| 239 |
+
"mean_l2_distance": 13.25,
|
| 240 |
+
"std_l2_distance": 13.25,
|
| 241 |
+
"mean_dimension_correlation": 0.923333740234375,
|
| 242 |
+
"std_dimension_correlation": 0.030134410337098863,
|
| 243 |
+
"linear_cka": 0.984375
|
| 244 |
+
},
|
| 245 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
|
| 246 |
+
"mse": 0.26171875,
|
| 247 |
+
"mean_cosine_similarity": 0.93359375,
|
| 248 |
+
"std_cosine_similarity": 0.16015625,
|
| 249 |
+
"mean_l2_distance": 13.4375,
|
| 250 |
+
"std_l2_distance": 13.25,
|
| 251 |
+
"mean_dimension_correlation": 0.9219314575195312,
|
| 252 |
+
"std_dimension_correlation": 0.03136389625872561,
|
| 253 |
+
"linear_cka": 0.984375
|
| 254 |
+
},
|
| 255 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
|
| 256 |
+
"mse": 0.25390625,
|
| 257 |
+
"mean_cosine_similarity": 0.9375,
|
| 258 |
+
"std_cosine_similarity": 0.1484375,
|
| 259 |
+
"mean_l2_distance": 13.0625,
|
| 260 |
+
"std_l2_distance": 12.625,
|
| 261 |
+
"mean_dimension_correlation": 0.92755126953125,
|
| 262 |
+
"std_dimension_correlation": 0.02898992593261031,
|
| 263 |
+
"linear_cka": 0.984375
|
| 264 |
+
},
|
| 265 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 266 |
+
"mse": 0.39453125,
|
| 267 |
+
"mean_cosine_similarity": 0.89453125,
|
| 268 |
+
"std_cosine_similarity": 0.154296875,
|
| 269 |
+
"mean_l2_distance": 20.125,
|
| 270 |
+
"std_l2_distance": 12.1875,
|
| 271 |
+
"mean_dimension_correlation": 0.8896469116210938,
|
| 272 |
+
"std_dimension_correlation": 0.03377379140546021,
|
| 273 |
+
"linear_cka": 0.96484375
|
| 274 |
+
},
|
| 275 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 276 |
+
"mse": 0.263671875,
|
| 277 |
+
"mean_cosine_similarity": 0.9375,
|
| 278 |
+
"std_cosine_similarity": 0.150390625,
|
| 279 |
+
"mean_l2_distance": 13.5625,
|
| 280 |
+
"std_l2_distance": 12.625,
|
| 281 |
+
"mean_dimension_correlation": 0.9245574951171875,
|
| 282 |
+
"std_dimension_correlation": 0.029099754782990043,
|
| 283 |
+
"linear_cka": 0.984375
|
| 284 |
+
},
|
| 285 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 286 |
+
"mse": 0.2578125,
|
| 287 |
+
"mean_cosine_similarity": 0.93359375,
|
| 288 |
+
"std_cosine_similarity": 0.16015625,
|
| 289 |
+
"mean_l2_distance": 13.25,
|
| 290 |
+
"std_l2_distance": 13.25,
|
| 291 |
+
"mean_dimension_correlation": 0.9233123779296875,
|
| 292 |
+
"std_dimension_correlation": 0.030156395218800952,
|
| 293 |
+
"linear_cka": 0.984375
|
| 294 |
+
},
|
| 295 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
|
| 296 |
+
"mse": 0.263671875,
|
| 297 |
+
"mean_cosine_similarity": 0.93359375,
|
| 298 |
+
"std_cosine_similarity": 0.1572265625,
|
| 299 |
+
"mean_l2_distance": 13.5625,
|
| 300 |
+
"std_l2_distance": 13.0625,
|
| 301 |
+
"mean_dimension_correlation": 0.9226715087890625,
|
| 302 |
+
"std_dimension_correlation": 0.02929662688468137,
|
| 303 |
+
"linear_cka": 0.984375
|
| 304 |
+
},
|
| 305 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
|
| 306 |
+
"mse": 0.255859375,
|
| 307 |
+
"mean_cosine_similarity": 0.9375,
|
| 308 |
+
"std_cosine_similarity": 0.1533203125,
|
| 309 |
+
"mean_l2_distance": 13.125,
|
| 310 |
+
"std_l2_distance": 12.8125,
|
| 311 |
+
"mean_dimension_correlation": 0.9262313842773438,
|
| 312 |
+
"std_dimension_correlation": 0.029011160291782537,
|
| 313 |
+
"linear_cka": 0.984375
|
| 314 |
+
},
|
| 315 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 316 |
+
"mse": 0.390625,
|
| 317 |
+
"mean_cosine_similarity": 0.89453125,
|
| 318 |
+
"std_cosine_similarity": 0.1591796875,
|
| 319 |
+
"mean_l2_distance": 20.0,
|
| 320 |
+
"std_l2_distance": 12.5,
|
| 321 |
+
"mean_dimension_correlation": 0.8883377075195312,
|
| 322 |
+
"std_dimension_correlation": 0.03512599620173197,
|
| 323 |
+
"linear_cka": 0.96484375
|
| 324 |
+
},
|
| 325 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 326 |
+
"mse": 0.271484375,
|
| 327 |
+
"mean_cosine_similarity": 0.93359375,
|
| 328 |
+
"std_cosine_similarity": 0.16015625,
|
| 329 |
+
"mean_l2_distance": 13.875,
|
| 330 |
+
"std_l2_distance": 13.0625,
|
| 331 |
+
"mean_dimension_correlation": 0.9205001831054688,
|
| 332 |
+
"std_dimension_correlation": 0.02990616928878693,
|
| 333 |
+
"linear_cka": 0.984375
|
| 334 |
+
},
|
| 335 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 336 |
+
"mse": 0.26171875,
|
| 337 |
+
"mean_cosine_similarity": 0.93359375,
|
| 338 |
+
"std_cosine_similarity": 0.16015625,
|
| 339 |
+
"mean_l2_distance": 13.4375,
|
| 340 |
+
"std_l2_distance": 13.25,
|
| 341 |
+
"mean_dimension_correlation": 0.922039794921875,
|
| 342 |
+
"std_dimension_correlation": 0.03143896607512693,
|
| 343 |
+
"linear_cka": 0.984375
|
| 344 |
+
},
|
| 345 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 346 |
+
"mse": 0.263671875,
|
| 347 |
+
"mean_cosine_similarity": 0.93359375,
|
| 348 |
+
"std_cosine_similarity": 0.1572265625,
|
| 349 |
+
"mean_l2_distance": 13.5625,
|
| 350 |
+
"std_l2_distance": 13.0625,
|
| 351 |
+
"mean_dimension_correlation": 0.9226806640625,
|
| 352 |
+
"std_dimension_correlation": 0.029339070768690877,
|
| 353 |
+
"linear_cka": 0.984375
|
| 354 |
+
},
|
| 355 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
|
| 356 |
+
"mse": 0.24609375,
|
| 357 |
+
"mean_cosine_similarity": 0.9375,
|
| 358 |
+
"std_cosine_similarity": 0.1591796875,
|
| 359 |
+
"mean_l2_distance": 12.625,
|
| 360 |
+
"std_l2_distance": 13.375,
|
| 361 |
+
"mean_dimension_correlation": 0.9257278442382812,
|
| 362 |
+
"std_dimension_correlation": 0.030489491126206747,
|
| 363 |
+
"linear_cka": 0.984375
|
| 364 |
+
},
|
| 365 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 366 |
+
"mse": 0.388671875,
|
| 367 |
+
"mean_cosine_similarity": 0.89453125,
|
| 368 |
+
"std_cosine_similarity": 0.1591796875,
|
| 369 |
+
"mean_l2_distance": 20.0,
|
| 370 |
+
"std_l2_distance": 12.4375,
|
| 371 |
+
"mean_dimension_correlation": 0.8896011352539063,
|
| 372 |
+
"std_dimension_correlation": 0.034245117741804325,
|
| 373 |
+
"linear_cka": 0.96484375
|
| 374 |
+
},
|
| 375 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 376 |
+
"mse": 0.267578125,
|
| 377 |
+
"mean_cosine_similarity": 0.93359375,
|
| 378 |
+
"std_cosine_similarity": 0.158203125,
|
| 379 |
+
"mean_l2_distance": 13.6875,
|
| 380 |
+
"std_l2_distance": 13.0,
|
| 381 |
+
"mean_dimension_correlation": 0.9218338012695313,
|
| 382 |
+
"std_dimension_correlation": 0.03096110466803191,
|
| 383 |
+
"linear_cka": 0.984375
|
| 384 |
+
},
|
| 385 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 386 |
+
"mse": 0.25390625,
|
| 387 |
+
"mean_cosine_similarity": 0.9375,
|
| 388 |
+
"std_cosine_similarity": 0.1484375,
|
| 389 |
+
"mean_l2_distance": 13.0625,
|
| 390 |
+
"std_l2_distance": 12.625,
|
| 391 |
+
"mean_dimension_correlation": 0.9275863647460938,
|
| 392 |
+
"std_dimension_correlation": 0.029019101935420444,
|
| 393 |
+
"linear_cka": 0.984375
|
| 394 |
+
},
|
| 395 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 396 |
+
"mse": 0.255859375,
|
| 397 |
+
"mean_cosine_similarity": 0.9375,
|
| 398 |
+
"std_cosine_similarity": 0.1533203125,
|
| 399 |
+
"mean_l2_distance": 13.125,
|
| 400 |
+
"std_l2_distance": 12.8125,
|
| 401 |
+
"mean_dimension_correlation": 0.9262100219726562,
|
| 402 |
+
"std_dimension_correlation": 0.029023808376502022,
|
| 403 |
+
"linear_cka": 0.984375
|
| 404 |
+
},
|
| 405 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
|
| 406 |
+
"mse": 0.24609375,
|
| 407 |
+
"mean_cosine_similarity": 0.9375,
|
| 408 |
+
"std_cosine_similarity": 0.1591796875,
|
| 409 |
+
"mean_l2_distance": 12.625,
|
| 410 |
+
"std_l2_distance": 13.375,
|
| 411 |
+
"mean_dimension_correlation": 0.9256805419921875,
|
| 412 |
+
"std_dimension_correlation": 0.030472261122601613,
|
| 413 |
+
"linear_cka": 0.984375
|
| 414 |
+
},
|
| 415 |
+
"avg_mse": 0.3045572916666667,
|
| 416 |
+
"std_mse": 0.061728089131668586,
|
| 417 |
+
"avg_mean_cosine_similarity": 0.9216145833333333,
|
| 418 |
+
"std_mean_cosine_similarity": 0.01921444452676741,
|
| 419 |
+
"avg_std_cosine_similarity": 0.156640625,
|
| 420 |
+
"std_std_cosine_similarity": 0.003999537123283247,
|
| 421 |
+
"avg_mean_l2_distance": 15.608333333333333,
|
| 422 |
+
"std_mean_l2_distance": 3.137845819808374,
|
| 423 |
+
"avg_std_l2_distance": 12.7875,
|
| 424 |
+
"std_std_l2_distance": 0.34746102898982306,
|
| 425 |
+
"avg_mean_dimension_correlation": 0.9121468607584635,
|
| 426 |
+
"std_mean_dimension_correlation": 0.016489903679962933,
|
| 427 |
+
"avg_std_dimension_correlation": 0.03143026060381273,
|
| 428 |
+
"std_std_dimension_correlation": 0.0023529554482308356,
|
| 429 |
+
"avg_linear_cka": 0.9778645833333334,
|
| 430 |
+
"std_linear_cka": 0.009207119546699838
|
| 431 |
+
}
|
| 432 |
+
}
|
| 433 |
+
}
|
evaluation/metrics_tokens_9004032.json
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 1,
|
| 3 |
+
"n_tokens": 9004032,
|
| 4 |
+
"global_step": 8793,
|
| 5 |
+
"training_metrics": {
|
| 6 |
+
"train/loss": 2.546875,
|
| 7 |
+
"train/contrastive": 2.453125,
|
| 8 |
+
"train/recons_loss": 0.56640625,
|
| 9 |
+
"train/balance_loss": 3.84375,
|
| 10 |
+
"train/balance_loss_contrastive": 2.84375,
|
| 11 |
+
"train/balance_loss_recons": 1.0078125,
|
| 12 |
+
"train/contrastive_std": 3.359375,
|
| 13 |
+
"train/recons_std": 0.06787109375,
|
| 14 |
+
"train/contrastive_min": 0.08935546875,
|
| 15 |
+
"train/contrastive_max": 7.125,
|
| 16 |
+
"train/recons_min": 0.482421875,
|
| 17 |
+
"train/recons_max": 0.65234375,
|
| 18 |
+
"train/Qwen3_0.6B_layer_2": 0.640625,
|
| 19 |
+
"train/Qwen3_0.6B_layer_4": 0.546875,
|
| 20 |
+
"train/Qwen3_1.7B_layer_2": 0.515625,
|
| 21 |
+
"train/Qwen3_1.7B_layer_4": 0.65234375,
|
| 22 |
+
"train/Qwen3_4B_layer_2": 0.482421875,
|
| 23 |
+
"train/Qwen3_4B_layer_4": 0.5625,
|
| 24 |
+
"train/contrastives": null,
|
| 25 |
+
"train/epoch": 1,
|
| 26 |
+
"train/n_tokens": 9004032,
|
| 27 |
+
"train/step": 8793
|
| 28 |
+
},
|
| 29 |
+
"eval_metrics": {
|
| 30 |
+
"global_step": 8793,
|
| 31 |
+
"n_tokens": 9004032,
|
| 32 |
+
"kl_divergence": {
|
| 33 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 6.006870746612549,
|
| 34 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 6.394875526428223,
|
| 35 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 5.912027359008789,
|
| 36 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.115749359130859,
|
| 37 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.151121616363525,
|
| 38 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.175347805023193,
|
| 39 |
+
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
|
| 40 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.124427318572998,
|
| 41 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.121898651123047,
|
| 42 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.1416893005371094,
|
| 43 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.1991913318634033,
|
| 44 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.14923357963562,
|
| 45 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.216580390930176,
|
| 46 |
+
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
|
| 47 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.9164533615112305,
|
| 48 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.773134708404541,
|
| 49 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.090576171875,
|
| 50 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 5.982679843902588,
|
| 51 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.148589134216309,
|
| 52 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.272560119628906,
|
| 53 |
+
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
|
| 54 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.3844406604766846,
|
| 55 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.425341844558716,
|
| 56 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.334113597869873,
|
| 57 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.3682360649108887,
|
| 58 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.3439788818359375,
|
| 59 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.5122714042663574,
|
| 60 |
+
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
|
| 61 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.4646520614624023,
|
| 62 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.1960129737854004,
|
| 63 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.9887456893920898,
|
| 64 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.074134111404419,
|
| 65 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.142500638961792,
|
| 66 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.9460818767547607,
|
| 67 |
+
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
|
| 68 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 3.4264042377471924,
|
| 69 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.065612554550171,
|
| 70 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.84149169921875,
|
| 71 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.0016493797302246,
|
| 72 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 2.982909679412842,
|
| 73 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.7882883548736572,
|
| 74 |
+
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
|
| 75 |
+
},
|
| 76 |
+
"mae_hidden_states": {
|
| 77 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.144361138343811,
|
| 78 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.1407876014709473,
|
| 79 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.1702628135681152,
|
| 80 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.169557809829712,
|
| 81 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.164795160293579,
|
| 82 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.165663480758667,
|
| 83 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 0.9478356242179871,
|
| 84 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9305350184440613,
|
| 85 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 0.9448918104171753,
|
| 86 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 0.9919092059135437,
|
| 87 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.9386879801750183,
|
| 88 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9315637350082397,
|
| 89 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 0.9601666331291199,
|
| 90 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.8851673007011414,
|
| 91 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.8906123042106628,
|
| 92 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.8979656100273132,
|
| 93 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.8988674283027649,
|
| 94 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.900534451007843,
|
| 95 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.154961109161377,
|
| 96 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.1417714357376099,
|
| 97 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.147143840789795,
|
| 98 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.1556771993637085,
|
| 99 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.14786696434021,
|
| 100 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.148809790611267,
|
| 101 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 0.9560009837150574,
|
| 102 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 0.9207914471626282,
|
| 103 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 0.9233508110046387,
|
| 104 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 0.93439781665802,
|
| 105 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.894271194934845,
|
| 106 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.9094542264938354,
|
| 107 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.068742275238037,
|
| 108 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.0256458520889282,
|
| 109 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.0398327112197876,
|
| 110 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.056915283203125,
|
| 111 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.0318653583526611,
|
| 112 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.016662359237671
|
| 113 |
+
},
|
| 114 |
+
"alignment": {
|
| 115 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 116 |
+
"mse": 0.328125,
|
| 117 |
+
"mean_cosine_similarity": 0.921875,
|
| 118 |
+
"std_cosine_similarity": 0.1474609375,
|
| 119 |
+
"mean_l2_distance": 16.875,
|
| 120 |
+
"std_l2_distance": 11.6875,
|
| 121 |
+
"mean_dimension_correlation": 0.911785888671875,
|
| 122 |
+
"std_dimension_correlation": 0.029143728520497736,
|
| 123 |
+
"linear_cka": 0.97265625
|
| 124 |
+
},
|
| 125 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 126 |
+
"mse": 0.333984375,
|
| 127 |
+
"mean_cosine_similarity": 0.91796875,
|
| 128 |
+
"std_cosine_similarity": 0.1533203125,
|
| 129 |
+
"mean_l2_distance": 17.0,
|
| 130 |
+
"std_l2_distance": 12.0,
|
| 131 |
+
"mean_dimension_correlation": 0.908795166015625,
|
| 132 |
+
"std_dimension_correlation": 0.030533706065498583,
|
| 133 |
+
"linear_cka": 0.97265625
|
| 134 |
+
},
|
| 135 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 136 |
+
"mse": 0.333984375,
|
| 137 |
+
"mean_cosine_similarity": 0.921875,
|
| 138 |
+
"std_cosine_similarity": 0.1455078125,
|
| 139 |
+
"mean_l2_distance": 17.0,
|
| 140 |
+
"std_l2_distance": 11.4375,
|
| 141 |
+
"mean_dimension_correlation": 0.9114944458007812,
|
| 142 |
+
"std_dimension_correlation": 0.02791911734622361,
|
| 143 |
+
"linear_cka": 0.97265625
|
| 144 |
+
},
|
| 145 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
|
| 146 |
+
"mse": 0.333984375,
|
| 147 |
+
"mean_cosine_similarity": 0.91796875,
|
| 148 |
+
"std_cosine_similarity": 0.150390625,
|
| 149 |
+
"mean_l2_distance": 17.0,
|
| 150 |
+
"std_l2_distance": 11.8125,
|
| 151 |
+
"mean_dimension_correlation": 0.910113525390625,
|
| 152 |
+
"std_dimension_correlation": 0.029502623650495614,
|
| 153 |
+
"linear_cka": 0.97265625
|
| 154 |
+
},
|
| 155 |
+
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
|
| 156 |
+
"mse": 0.330078125,
|
| 157 |
+
"mean_cosine_similarity": 0.921875,
|
| 158 |
+
"std_cosine_similarity": 0.1513671875,
|
| 159 |
+
"mean_l2_distance": 17.0,
|
| 160 |
+
"std_l2_distance": 11.75,
|
| 161 |
+
"mean_dimension_correlation": 0.910638427734375,
|
| 162 |
+
"std_dimension_correlation": 0.029250348976490634,
|
| 163 |
+
"linear_cka": 0.97265625
|
| 164 |
+
},
|
| 165 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 166 |
+
"mse": 0.328125,
|
| 167 |
+
"mean_cosine_similarity": 0.921875,
|
| 168 |
+
"std_cosine_similarity": 0.1474609375,
|
| 169 |
+
"mean_l2_distance": 16.875,
|
| 170 |
+
"std_l2_distance": 11.6875,
|
| 171 |
+
"mean_dimension_correlation": 0.9118026733398438,
|
| 172 |
+
"std_dimension_correlation": 0.02911178290188815,
|
| 173 |
+
"linear_cka": 0.97265625
|
| 174 |
+
},
|
| 175 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 176 |
+
"mse": 0.259765625,
|
| 177 |
+
"mean_cosine_similarity": 0.9375,
|
| 178 |
+
"std_cosine_similarity": 0.1484375,
|
| 179 |
+
"mean_l2_distance": 13.3125,
|
| 180 |
+
"std_l2_distance": 12.4375,
|
| 181 |
+
"mean_dimension_correlation": 0.9281707763671875,
|
| 182 |
+
"std_dimension_correlation": 0.027613594267907524,
|
| 183 |
+
"linear_cka": 0.984375
|
| 184 |
+
},
|
| 185 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 186 |
+
"mse": 0.25390625,
|
| 187 |
+
"mean_cosine_similarity": 0.94140625,
|
| 188 |
+
"std_cosine_similarity": 0.1474609375,
|
| 189 |
+
"mean_l2_distance": 13.0625,
|
| 190 |
+
"std_l2_distance": 12.375,
|
| 191 |
+
"mean_dimension_correlation": 0.929296875,
|
| 192 |
+
"std_dimension_correlation": 0.027428457660098507,
|
| 193 |
+
"linear_cka": 0.984375
|
| 194 |
+
},
|
| 195 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
|
| 196 |
+
"mse": 0.26171875,
|
| 197 |
+
"mean_cosine_similarity": 0.9375,
|
| 198 |
+
"std_cosine_similarity": 0.1572265625,
|
| 199 |
+
"mean_l2_distance": 13.375,
|
| 200 |
+
"std_l2_distance": 12.8125,
|
| 201 |
+
"mean_dimension_correlation": 0.9258895874023437,
|
| 202 |
+
"std_dimension_correlation": 0.02807925327640673,
|
| 203 |
+
"linear_cka": 0.984375
|
| 204 |
+
},
|
| 205 |
+
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
|
| 206 |
+
"mse": 0.2578125,
|
| 207 |
+
"mean_cosine_similarity": 0.9375,
|
| 208 |
+
"std_cosine_similarity": 0.15625,
|
| 209 |
+
"mean_l2_distance": 13.1875,
|
| 210 |
+
"std_l2_distance": 12.875,
|
| 211 |
+
"mean_dimension_correlation": 0.9261764526367188,
|
| 212 |
+
"std_dimension_correlation": 0.029308957111961503,
|
| 213 |
+
"linear_cka": 0.984375
|
| 214 |
+
},
|
| 215 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 216 |
+
"mse": 0.333984375,
|
| 217 |
+
"mean_cosine_similarity": 0.91796875,
|
| 218 |
+
"std_cosine_similarity": 0.1533203125,
|
| 219 |
+
"mean_l2_distance": 17.0,
|
| 220 |
+
"std_l2_distance": 12.0,
|
| 221 |
+
"mean_dimension_correlation": 0.9088623046875,
|
| 222 |
+
"std_dimension_correlation": 0.030521200956836466,
|
| 223 |
+
"linear_cka": 0.97265625
|
| 224 |
+
},
|
| 225 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 226 |
+
"mse": 0.259765625,
|
| 227 |
+
"mean_cosine_similarity": 0.9375,
|
| 228 |
+
"std_cosine_similarity": 0.1484375,
|
| 229 |
+
"mean_l2_distance": 13.3125,
|
| 230 |
+
"std_l2_distance": 12.4375,
|
| 231 |
+
"mean_dimension_correlation": 0.9282363891601563,
|
| 232 |
+
"std_dimension_correlation": 0.02761614875613791,
|
| 233 |
+
"linear_cka": 0.984375
|
| 234 |
+
},
|
| 235 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 236 |
+
"mse": 0.248046875,
|
| 237 |
+
"mean_cosine_similarity": 0.9375,
|
| 238 |
+
"std_cosine_similarity": 0.15625,
|
| 239 |
+
"mean_l2_distance": 12.6875,
|
| 240 |
+
"std_l2_distance": 13.0,
|
| 241 |
+
"mean_dimension_correlation": 0.9286865234375,
|
| 242 |
+
"std_dimension_correlation": 0.028394499325967187,
|
| 243 |
+
"linear_cka": 1.0
|
| 244 |
+
},
|
| 245 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
|
| 246 |
+
"mse": 0.251953125,
|
| 247 |
+
"mean_cosine_similarity": 0.9375,
|
| 248 |
+
"std_cosine_similarity": 0.1572265625,
|
| 249 |
+
"mean_l2_distance": 12.875,
|
| 250 |
+
"std_l2_distance": 13.0,
|
| 251 |
+
"mean_dimension_correlation": 0.9273910522460938,
|
| 252 |
+
"std_dimension_correlation": 0.029792982191153054,
|
| 253 |
+
"linear_cka": 1.0
|
| 254 |
+
},
|
| 255 |
+
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
|
| 256 |
+
"mse": 0.2451171875,
|
| 257 |
+
"mean_cosine_similarity": 0.94140625,
|
| 258 |
+
"std_cosine_similarity": 0.146484375,
|
| 259 |
+
"mean_l2_distance": 12.5625,
|
| 260 |
+
"std_l2_distance": 12.4375,
|
| 261 |
+
"mean_dimension_correlation": 0.9318832397460938,
|
| 262 |
+
"std_dimension_correlation": 0.02779797256144542,
|
| 263 |
+
"linear_cka": 0.984375
|
| 264 |
+
},
|
| 265 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 266 |
+
"mse": 0.333984375,
|
| 267 |
+
"mean_cosine_similarity": 0.921875,
|
| 268 |
+
"std_cosine_similarity": 0.1455078125,
|
| 269 |
+
"mean_l2_distance": 17.0,
|
| 270 |
+
"std_l2_distance": 11.4375,
|
| 271 |
+
"mean_dimension_correlation": 0.9115066528320312,
|
| 272 |
+
"std_dimension_correlation": 0.02783942438110558,
|
| 273 |
+
"linear_cka": 0.97265625
|
| 274 |
+
},
|
| 275 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 276 |
+
"mse": 0.25390625,
|
| 277 |
+
"mean_cosine_similarity": 0.94140625,
|
| 278 |
+
"std_cosine_similarity": 0.1474609375,
|
| 279 |
+
"mean_l2_distance": 13.0625,
|
| 280 |
+
"std_l2_distance": 12.375,
|
| 281 |
+
"mean_dimension_correlation": 0.929364013671875,
|
| 282 |
+
"std_dimension_correlation": 0.027418246966595963,
|
| 283 |
+
"linear_cka": 0.984375
|
| 284 |
+
},
|
| 285 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 286 |
+
"mse": 0.248046875,
|
| 287 |
+
"mean_cosine_similarity": 0.9375,
|
| 288 |
+
"std_cosine_similarity": 0.15625,
|
| 289 |
+
"mean_l2_distance": 12.6875,
|
| 290 |
+
"std_l2_distance": 13.0,
|
| 291 |
+
"mean_dimension_correlation": 0.9286041259765625,
|
| 292 |
+
"std_dimension_correlation": 0.028414978282929146,
|
| 293 |
+
"linear_cka": 1.0
|
| 294 |
+
},
|
| 295 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
|
| 296 |
+
"mse": 0.25390625,
|
| 297 |
+
"mean_cosine_similarity": 0.9375,
|
| 298 |
+
"std_cosine_similarity": 0.154296875,
|
| 299 |
+
"mean_l2_distance": 13.0,
|
| 300 |
+
"std_l2_distance": 12.8125,
|
| 301 |
+
"mean_dimension_correlation": 0.927911376953125,
|
| 302 |
+
"std_dimension_correlation": 0.027630500633115,
|
| 303 |
+
"linear_cka": 0.984375
|
| 304 |
+
},
|
| 305 |
+
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
|
| 306 |
+
"mse": 0.24609375,
|
| 307 |
+
"mean_cosine_similarity": 0.94140625,
|
| 308 |
+
"std_cosine_similarity": 0.1513671875,
|
| 309 |
+
"mean_l2_distance": 12.6875,
|
| 310 |
+
"std_l2_distance": 12.625,
|
| 311 |
+
"mean_dimension_correlation": 0.9304595947265625,
|
| 312 |
+
"std_dimension_correlation": 0.027486156310205404,
|
| 313 |
+
"linear_cka": 0.984375
|
| 314 |
+
},
|
| 315 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
|
| 316 |
+
"mse": 0.333984375,
|
| 317 |
+
"mean_cosine_similarity": 0.91796875,
|
| 318 |
+
"std_cosine_similarity": 0.150390625,
|
| 319 |
+
"mean_l2_distance": 17.0,
|
| 320 |
+
"std_l2_distance": 11.8125,
|
| 321 |
+
"mean_dimension_correlation": 0.910162353515625,
|
| 322 |
+
"std_dimension_correlation": 0.029493359043523553,
|
| 323 |
+
"linear_cka": 0.97265625
|
| 324 |
+
},
|
| 325 |
+
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
|
| 326 |
+
"mse": 0.26171875,
|
| 327 |
+
"mean_cosine_similarity": 0.9375,
|
| 328 |
+
"std_cosine_similarity": 0.1572265625,
|
| 329 |
+
"mean_l2_distance": 13.375,
|
| 330 |
+
"std_l2_distance": 12.8125,
|
| 331 |
+
"mean_dimension_correlation": 0.9259017944335938,
|
| 332 |
+
"std_dimension_correlation": 0.02807177297712858,
|
| 333 |
+
"linear_cka": 0.984375
|
| 334 |
+
},
|
| 335 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
|
| 336 |
+
"mse": 0.251953125,
|
| 337 |
+
"mean_cosine_similarity": 0.9375,
|
| 338 |
+
"std_cosine_similarity": 0.1572265625,
|
| 339 |
+
"mean_l2_distance": 12.875,
|
| 340 |
+
"std_l2_distance": 13.0,
|
| 341 |
+
"mean_dimension_correlation": 0.9274307250976562,
|
| 342 |
+
"std_dimension_correlation": 0.029833198285711512,
|
| 343 |
+
"linear_cka": 1.0
|
| 344 |
+
},
|
| 345 |
+
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
|
| 346 |
+
"mse": 0.25390625,
|
| 347 |
+
"mean_cosine_similarity": 0.9375,
|
| 348 |
+
"std_cosine_similarity": 0.154296875,
|
| 349 |
+
"mean_l2_distance": 13.0,
|
| 350 |
+
"std_l2_distance": 12.8125,
|
| 351 |
+
"mean_dimension_correlation": 0.9279266357421875,
|
| 352 |
+
"std_dimension_correlation": 0.027642045164903522,
|
| 353 |
+
"linear_cka": 0.984375
|
| 354 |
+
},
|
| 355 |
+
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
|
| 356 |
+
"mse": 0.236328125,
|
| 357 |
+
"mean_cosine_similarity": 0.94140625,
|
| 358 |
+
"std_cosine_similarity": 0.158203125,
|
| 359 |
+
"mean_l2_distance": 12.125,
|
| 360 |
+
"std_l2_distance": 13.25,
|
| 361 |
+
"mean_dimension_correlation": 0.93009033203125,
|
| 362 |
+
"std_dimension_correlation": 0.029206890514525005,
|
| 363 |
+
"linear_cka": 1.0
|
| 364 |
+
},
|
| 365 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
|
| 366 |
+
"mse": 0.330078125,
|
| 367 |
+
"mean_cosine_similarity": 0.921875,
|
| 368 |
+
"std_cosine_similarity": 0.1513671875,
|
| 369 |
+
"mean_l2_distance": 17.0,
|
| 370 |
+
"std_l2_distance": 11.75,
|
| 371 |
+
"mean_dimension_correlation": 0.910687255859375,
|
| 372 |
+
"std_dimension_correlation": 0.02925704219094372,
|
| 373 |
+
"linear_cka": 0.97265625
|
| 374 |
+
},
|
| 375 |
+
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
|
| 376 |
+
"mse": 0.2578125,
|
| 377 |
+
"mean_cosine_similarity": 0.9375,
|
| 378 |
+
"std_cosine_similarity": 0.15625,
|
| 379 |
+
"mean_l2_distance": 13.1875,
|
| 380 |
+
"std_l2_distance": 12.875,
|
| 381 |
+
"mean_dimension_correlation": 0.9263031005859375,
|
| 382 |
+
"std_dimension_correlation": 0.0292820509917565,
|
| 383 |
+
"linear_cka": 0.984375
|
| 384 |
+
},
|
| 385 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
|
| 386 |
+
"mse": 0.2451171875,
|
| 387 |
+
"mean_cosine_similarity": 0.94140625,
|
| 388 |
+
"std_cosine_similarity": 0.146484375,
|
| 389 |
+
"mean_l2_distance": 12.5625,
|
| 390 |
+
"std_l2_distance": 12.4375,
|
| 391 |
+
"mean_dimension_correlation": 0.9319442749023438,
|
| 392 |
+
"std_dimension_correlation": 0.02774844077379742,
|
| 393 |
+
"linear_cka": 0.984375
|
| 394 |
+
},
|
| 395 |
+
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
|
| 396 |
+
"mse": 0.24609375,
|
| 397 |
+
"mean_cosine_similarity": 0.94140625,
|
| 398 |
+
"std_cosine_similarity": 0.1513671875,
|
| 399 |
+
"mean_l2_distance": 12.6875,
|
| 400 |
+
"std_l2_distance": 12.625,
|
| 401 |
+
"mean_dimension_correlation": 0.9305908203125,
|
| 402 |
+
"std_dimension_correlation": 0.027491914687628318,
|
| 403 |
+
"linear_cka": 0.984375
|
| 404 |
+
},
|
| 405 |
+
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
|
| 406 |
+
"mse": 0.236328125,
|
| 407 |
+
"mean_cosine_similarity": 0.94140625,
|
| 408 |
+
"std_cosine_similarity": 0.158203125,
|
| 409 |
+
"mean_l2_distance": 12.125,
|
| 410 |
+
"std_l2_distance": 13.25,
|
| 411 |
+
"mean_dimension_correlation": 0.9300765991210938,
|
| 412 |
+
"std_dimension_correlation": 0.029167152542476576,
|
| 413 |
+
"linear_cka": 1.0
|
| 414 |
+
},
|
| 415 |
+
"avg_mse": 0.2783203125,
|
| 416 |
+
"std_mse": 0.03847375333442295,
|
| 417 |
+
"avg_mean_cosine_similarity": 0.9328125,
|
| 418 |
+
"std_mean_cosine_similarity": 0.009043622580304863,
|
| 419 |
+
"avg_std_cosine_similarity": 0.15208333333333332,
|
| 420 |
+
"std_std_cosine_similarity": 0.004211187924165684,
|
| 421 |
+
"avg_mean_l2_distance": 14.25,
|
| 422 |
+
"std_mean_l2_distance": 1.9497596005661826,
|
| 423 |
+
"avg_std_l2_distance": 12.420833333333333,
|
| 424 |
+
"std_std_l2_distance": 0.5426913385054979,
|
| 425 |
+
"avg_mean_dimension_correlation": 0.9226060994466144,
|
| 426 |
+
"std_mean_dimension_correlation": 0.00864781898546122,
|
| 427 |
+
"avg_std_dimension_correlation": 0.02859991824384515,
|
| 428 |
+
"std_std_dimension_correlation": 0.00095096984819575,
|
| 429 |
+
"avg_linear_cka": 0.98359375,
|
| 430 |
+
"std_linear_cka": 0.009695057535930357
|
| 431 |
+
}
|
| 432 |
+
}
|
| 433 |
+
}
|
evaluation/plots/kl_divergences_step_1954_tokens_2000896.png
ADDED
|
Git LFS Details
|
evaluation/plots/kl_divergences_step_2931_tokens_3001344.png
ADDED
|
Git LFS Details
|
evaluation/plots/kl_divergences_step_3908_tokens_4001792.png
ADDED
|
Git LFS Details
|
evaluation/plots/kl_divergences_step_4885_tokens_5002240.png
ADDED
|
Git LFS Details
|
evaluation/plots/kl_divergences_step_5862_tokens_6002688.png
ADDED
|
Git LFS Details
|
evaluation/plots/kl_divergences_step_6839_tokens_7003136.png
ADDED
|
Git LFS Details
|
evaluation/plots/kl_divergences_step_7816_tokens_8003584.png
ADDED
|
Git LFS Details
|
evaluation/plots/kl_divergences_step_8793_tokens_9004032.png
ADDED
|
Git LFS Details
|
evaluation/plots/kl_divergences_step_977_tokens_1000448.png
ADDED
|
Git LFS Details
|
evaluation/plots/mae_hidden_states_step_1954_tokens_2000896.png
ADDED
|
Git LFS Details
|
evaluation/plots/mae_hidden_states_step_2931_tokens_3001344.png
ADDED
|
Git LFS Details
|
evaluation/plots/mae_hidden_states_step_3908_tokens_4001792.png
ADDED
|
Git LFS Details
|
evaluation/plots/mae_hidden_states_step_4885_tokens_5002240.png
ADDED
|
Git LFS Details
|
evaluation/plots/mae_hidden_states_step_5862_tokens_6002688.png
ADDED
|
Git LFS Details
|
evaluation/plots/mae_hidden_states_step_6839_tokens_7003136.png
ADDED
|
Git LFS Details
|
evaluation/plots/mae_hidden_states_step_7816_tokens_8003584.png
ADDED
|
Git LFS Details
|
evaluation/plots/mae_hidden_states_step_8793_tokens_9004032.png
ADDED
|
Git LFS Details
|
evaluation/plots/mae_hidden_states_step_977_tokens_1000448.png
ADDED
|
Git LFS Details
|
evaluation/plots/multi_dataset_alignment_step_1954_tokens_2000896.png
ADDED
|
Git LFS Details
|
evaluation/plots/multi_dataset_alignment_step_2931_tokens_3001344.png
ADDED
|
Git LFS Details
|
evaluation/plots/multi_dataset_alignment_step_3908_tokens_4001792.png
ADDED
|
Git LFS Details
|
evaluation/plots/multi_dataset_alignment_step_4885_tokens_5002240.png
ADDED
|
Git LFS Details
|
evaluation/plots/multi_dataset_alignment_step_5862_tokens_6002688.png
ADDED
|
Git LFS Details
|
evaluation/plots/multi_dataset_alignment_step_6839_tokens_7003136.png
ADDED
|
Git LFS Details
|
evaluation/plots/multi_dataset_alignment_step_7816_tokens_8003584.png
ADDED
|
Git LFS Details
|
evaluation/plots/multi_dataset_alignment_step_8793_tokens_9004032.png
ADDED
|
Git LFS Details
|
evaluation/plots/multi_dataset_alignment_step_977_tokens_1000448.png
ADDED
|
Git LFS Details
|
metrics_tokens_1000448.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"train/loss": 0.3984375,
|
| 3 |
+
"train/contrastive": 0.310546875,
|
| 4 |
+
"train/recons_loss": 0.68359375,
|
| 5 |
+
"train/balance_loss": 2.015625,
|
| 6 |
+
"train/balance_loss_contrastive": 1.0,
|
| 7 |
+
"train/balance_loss_recons": 1.015625,
|
| 8 |
+
"train/contrastive_std": 0.015625,
|
| 9 |
+
"train/recons_std": 0.0888671875,
|
| 10 |
+
"train/contrastive_min": 0.27734375,
|
| 11 |
+
"train/contrastive_max": 0.3359375,
|
| 12 |
+
"train/recons_min": 0.58203125,
|
| 13 |
+
"train/recons_max": 0.84375,
|
| 14 |
+
"train/Qwen3_0.6B_layer_2": 0.58203125,
|
| 15 |
+
"train/Qwen3_0.6B_layer_4": 0.69140625,
|
| 16 |
+
"train/Qwen3_1.7B_layer_2": 0.65234375,
|
| 17 |
+
"train/Qwen3_1.7B_layer_4": 0.84375,
|
| 18 |
+
"train/Qwen3_4B_layer_2": 0.63671875,
|
| 19 |
+
"train/Qwen3_4B_layer_4": 0.6953125,
|
| 20 |
+
"train/contrastives": null,
|
| 21 |
+
"train/epoch": 1,
|
| 22 |
+
"train/n_tokens": 1000448,
|
| 23 |
+
"train/step": 977
|
| 24 |
+
}
|
metrics_tokens_2000896.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"train/loss": 2.609375,
|
| 3 |
+
"train/contrastive": 2.5,
|
| 4 |
+
"train/recons_loss": 0.71484375,
|
| 5 |
+
"train/balance_loss": 3.671875,
|
| 6 |
+
"train/balance_loss_contrastive": 2.625,
|
| 7 |
+
"train/balance_loss_recons": 1.046875,
|
| 8 |
+
"train/contrastive_std": 3.234375,
|
| 9 |
+
"train/recons_std": 0.171875,
|
| 10 |
+
"train/contrastive_min": 0.224609375,
|
| 11 |
+
"train/contrastive_max": 7.0,
|
| 12 |
+
"train/recons_min": 0.5859375,
|
| 13 |
+
"train/recons_max": 1.046875,
|
| 14 |
+
"train/Qwen3_0.6B_layer_2": 1.046875,
|
| 15 |
+
"train/Qwen3_0.6B_layer_4": 0.63671875,
|
| 16 |
+
"train/Qwen3_1.7B_layer_2": 0.62109375,
|
| 17 |
+
"train/Qwen3_1.7B_layer_4": 0.75,
|
| 18 |
+
"train/Qwen3_4B_layer_2": 0.5859375,
|
| 19 |
+
"train/Qwen3_4B_layer_4": 0.65234375,
|
| 20 |
+
"train/contrastives": null,
|
| 21 |
+
"train/epoch": 1,
|
| 22 |
+
"train/n_tokens": 2000896,
|
| 23 |
+
"train/step": 1954
|
| 24 |
+
}
|
metrics_tokens_3001344.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"train/loss": 2.515625,
|
| 3 |
+
"train/contrastive": 2.421875,
|
| 4 |
+
"train/recons_loss": 0.671875,
|
| 5 |
+
"train/balance_loss": 3.75,
|
| 6 |
+
"train/balance_loss_contrastive": 2.71875,
|
| 7 |
+
"train/balance_loss_recons": 1.0390625,
|
| 8 |
+
"train/contrastive_std": 3.25,
|
| 9 |
+
"train/recons_std": 0.138671875,
|
| 10 |
+
"train/contrastive_min": 0.146484375,
|
| 11 |
+
"train/contrastive_max": 6.9375,
|
| 12 |
+
"train/recons_min": 0.56640625,
|
| 13 |
+
"train/recons_max": 0.9375,
|
| 14 |
+
"train/Qwen3_0.6B_layer_2": 0.9375,
|
| 15 |
+
"train/Qwen3_0.6B_layer_4": 0.59765625,
|
| 16 |
+
"train/Qwen3_1.7B_layer_2": 0.59375,
|
| 17 |
+
"train/Qwen3_1.7B_layer_4": 0.703125,
|
| 18 |
+
"train/Qwen3_4B_layer_2": 0.56640625,
|
| 19 |
+
"train/Qwen3_4B_layer_4": 0.6328125,
|
| 20 |
+
"train/contrastives": null,
|
| 21 |
+
"train/epoch": 1,
|
| 22 |
+
"train/n_tokens": 3001344,
|
| 23 |
+
"train/step": 2931
|
| 24 |
+
}
|