bdbj commited on
Commit
d7b2a5b
·
verified ·
1 Parent(s): 3d431a5

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. config.json +2885 -0
  3. generation_config.json +9 -0
  4. lib/__init__.py +1 -0
  5. lib/__pycache__/__init__.cpython-311.pyc +0 -0
  6. lib/__pycache__/config.cpython-311.pyc +0 -0
  7. lib/algo/__init__.py +0 -0
  8. lib/algo/__pycache__/__init__.cpython-311.pyc +0 -0
  9. lib/algo/__pycache__/ldlq.cpython-311.pyc +0 -0
  10. lib/algo/ldlq.py +203 -0
  11. lib/algo/ldlq_beam_cd.py +209 -0
  12. lib/codebook/__pycache__/bitshift.cpython-311.pyc +0 -0
  13. lib/codebook/__pycache__/vq_codebook.cpython-311.pyc +0 -0
  14. lib/codebook/bitshift.py +486 -0
  15. lib/codebook/vq_codebook.py +56 -0
  16. lib/config.py +6 -0
  17. lib/linear/__init__.py +430 -0
  18. lib/linear/__pycache__/__init__.cpython-311.pyc +0 -0
  19. lib/linear/__pycache__/comb_linear.cpython-311.pyc +0 -0
  20. lib/linear/__pycache__/incoherent_linear.cpython-311.pyc +0 -0
  21. lib/linear/__pycache__/quantized_linear.cpython-311.pyc +0 -0
  22. lib/linear/__pycache__/tcq_linear.cpython-311.pyc +0 -0
  23. lib/linear/__pycache__/vq_linear.cpython-311.pyc +0 -0
  24. lib/linear/comb_linear.py +325 -0
  25. lib/linear/incoherent_linear.py +639 -0
  26. lib/linear/quantized_linear.py +154 -0
  27. lib/linear/rotation.py +16 -0
  28. lib/linear/tcq_linear.py +122 -0
  29. lib/linear/vq_linear.py +208 -0
  30. lib/quantizer/__pycache__/comb_quant.cpython-311.pyc +0 -0
  31. lib/quantizer/__pycache__/nuq_op.cpython-311.pyc +0 -0
  32. lib/quantizer/__pycache__/pack_op.cpython-311.pyc +0 -0
  33. lib/quantizer/__pycache__/pack_op.general_pack_32-88.py311.1.nbc +0 -0
  34. lib/quantizer/__pycache__/pack_op.general_pack_32-88.py311.nbi +0 -0
  35. lib/quantizer/__pycache__/pack_op.pack_32-242.py311.1.nbc +0 -0
  36. lib/quantizer/__pycache__/pack_op.pack_32-242.py311.nbi +0 -0
  37. lib/quantizer/__pycache__/pack_op.pack_codes_32-186.py311.1.nbc +3 -0
  38. lib/quantizer/__pycache__/pack_op.pack_codes_32-186.py311.nbi +0 -0
  39. lib/quantizer/__pycache__/pack_op.pack_for_sq_pack_kernel-287.py311.1.nbc +0 -0
  40. lib/quantizer/__pycache__/pack_op.pack_for_sq_pack_kernel-287.py311.nbi +0 -0
  41. lib/quantizer/__pycache__/quant_op.cpython-311.pyc +0 -0
  42. lib/quantizer/__pycache__/tcq_quant.cpython-311.pyc +0 -0
  43. lib/quantizer/__pycache__/vq_quant.cpython-311.pyc +0 -0
  44. lib/quantizer/__pycache__/vq_quant_ldlq.cpython-311.pyc +0 -0
  45. lib/quantizer/comb_quant.py +201 -0
  46. lib/quantizer/nuq_op.py +431 -0
  47. lib/quantizer/pack_op.py +335 -0
  48. lib/quantizer/quant_op.py +277 -0
  49. lib/quantizer/tcq_quant.py +160 -0
  50. lib/quantizer/vq_quant.py +149 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ lib/quantizer/__pycache__/pack_op.pack_codes_32-186.py311.1.nbc filter=lfs diff=lfs merge=lfs -text
37
+ lib/utils/__pycache__/matmul_had.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
38
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,2885 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "meta-llama/Llama-3.2-1B",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoModelForCausalLM": "qpal_modelling_llama.QPalLlamaForCausalLM"
10
+ },
11
+ "bos_token_id": 128000,
12
+ "eos_token_id": 128001,
13
+ "head_dim": 64,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 2048,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 8192,
18
+ "max_position_embeddings": 131072,
19
+ "mlp_bias": false,
20
+ "model_type": "llama",
21
+ "num_attention_heads": 32,
22
+ "num_hidden_layers": 16,
23
+ "num_key_value_heads": 8,
24
+ "pretraining_tp": 1,
25
+ "qpal_quant_config": {
26
+ "modules": {
27
+ "model.layers.0.mlp.down_proj": {
28
+ "bias": false,
29
+ "dtype": "float32",
30
+ "hadU": 8192,
31
+ "hadV": 2048,
32
+ "in_features": 8192,
33
+ "linear": {
34
+ "KV": 7,
35
+ "L": 16,
36
+ "V": 2,
37
+ "bias": false,
38
+ "in_features": 8192,
39
+ "linear_cls": "QTIPLinearTCQ",
40
+ "linear_dtype": "float32",
41
+ "out_features": 2048,
42
+ "td_x": 16,
43
+ "td_y": 16,
44
+ "tlut_bits": 9
45
+ },
46
+ "module_type": "IncoherentLinear",
47
+ "out_features": 2048,
48
+ "rot_info": "skip_r",
49
+ "scale": 32.0
50
+ },
51
+ "model.layers.0.mlp.gate_proj": {
52
+ "bias": false,
53
+ "dtype": "float32",
54
+ "hadU": 2048,
55
+ "hadV": 8192,
56
+ "in_features": 2048,
57
+ "linear": {
58
+ "KV": 5,
59
+ "L": 16,
60
+ "V": 2,
61
+ "bias": false,
62
+ "in_features": 2048,
63
+ "linear_cls": "QTIPLinearTCQ",
64
+ "linear_dtype": "float32",
65
+ "out_features": 8192,
66
+ "td_x": 16,
67
+ "td_y": 16,
68
+ "tlut_bits": 9
69
+ },
70
+ "module_type": "IncoherentLinear",
71
+ "out_features": 8192,
72
+ "rot_info": "skip_r",
73
+ "scale": 32.0
74
+ },
75
+ "model.layers.0.mlp.up_proj": {
76
+ "bias": false,
77
+ "dtype": "float32",
78
+ "hadU": 2048,
79
+ "hadV": 8192,
80
+ "in_features": 2048,
81
+ "linear": {
82
+ "KV": 6,
83
+ "L": 16,
84
+ "V": 2,
85
+ "bias": false,
86
+ "in_features": 2048,
87
+ "linear_cls": "QTIPLinearTCQ",
88
+ "linear_dtype": "float32",
89
+ "out_features": 8192,
90
+ "td_x": 16,
91
+ "td_y": 16,
92
+ "tlut_bits": 9
93
+ },
94
+ "module_type": "IncoherentLinear",
95
+ "out_features": 8192,
96
+ "rot_info": "skip_r",
97
+ "scale": 32.0
98
+ },
99
+ "model.layers.0.self_attn.k_proj": {
100
+ "bias": false,
101
+ "dtype": "float32",
102
+ "hadU": 2048,
103
+ "hadV": 512,
104
+ "in_features": 2048,
105
+ "linear": {
106
+ "KV": 7,
107
+ "L": 16,
108
+ "V": 2,
109
+ "bias": false,
110
+ "in_features": 2048,
111
+ "linear_cls": "QTIPLinearTCQ",
112
+ "linear_dtype": "float32",
113
+ "out_features": 512,
114
+ "td_x": 16,
115
+ "td_y": 16,
116
+ "tlut_bits": 9
117
+ },
118
+ "module_type": "IncoherentLinear",
119
+ "out_features": 512,
120
+ "rot_info": "skip_r",
121
+ "scale": 32.0
122
+ },
123
+ "model.layers.0.self_attn.o_proj": {
124
+ "bias": false,
125
+ "dtype": "float32",
126
+ "hadU": 2048,
127
+ "hadV": 2048,
128
+ "in_features": 2048,
129
+ "linear": {
130
+ "KV": 7,
131
+ "L": 16,
132
+ "V": 2,
133
+ "bias": false,
134
+ "in_features": 2048,
135
+ "linear_cls": "QTIPLinearTCQ",
136
+ "linear_dtype": "float32",
137
+ "out_features": 2048,
138
+ "td_x": 16,
139
+ "td_y": 16,
140
+ "tlut_bits": 9
141
+ },
142
+ "module_type": "IncoherentLinear",
143
+ "out_features": 2048,
144
+ "rot_info": "skip_r",
145
+ "scale": 32.0
146
+ },
147
+ "model.layers.0.self_attn.q_proj": {
148
+ "bias": false,
149
+ "dtype": "float32",
150
+ "hadU": 2048,
151
+ "hadV": 2048,
152
+ "in_features": 2048,
153
+ "linear": {
154
+ "KV": 4,
155
+ "L": 16,
156
+ "V": 2,
157
+ "bias": false,
158
+ "in_features": 2048,
159
+ "linear_cls": "QTIPLinearTCQ",
160
+ "linear_dtype": "float32",
161
+ "out_features": 2048,
162
+ "td_x": 16,
163
+ "td_y": 16,
164
+ "tlut_bits": 9
165
+ },
166
+ "module_type": "IncoherentLinear",
167
+ "out_features": 2048,
168
+ "rot_info": "skip_r",
169
+ "scale": 32.0
170
+ },
171
+ "model.layers.0.self_attn.v_proj": {
172
+ "bias": false,
173
+ "dtype": "float32",
174
+ "hadU": 2048,
175
+ "hadV": 512,
176
+ "in_features": 2048,
177
+ "linear": {
178
+ "KV": [
179
+ 9,
180
+ 10
181
+ ],
182
+ "L": 16,
183
+ "V": 2,
184
+ "bias": false,
185
+ "in_features": 2048,
186
+ "in_part": [
187
+ 1024,
188
+ 1024
189
+ ],
190
+ "linear_cls": "CombtLinearTCQ",
191
+ "linear_dtype": "float32",
192
+ "out_features": 512,
193
+ "td_x": 16,
194
+ "td_y": 16,
195
+ "tlut_bits": 11
196
+ },
197
+ "module_type": "IncoherentLinear",
198
+ "out_features": 512,
199
+ "rot_info": "skip_r",
200
+ "scale": 32.0
201
+ },
202
+ "model.layers.1.mlp.down_proj": {
203
+ "bias": false,
204
+ "dtype": "float32",
205
+ "hadU": 8192,
206
+ "hadV": 2048,
207
+ "in_features": 8192,
208
+ "linear": {
209
+ "KV": 10,
210
+ "L": 16,
211
+ "V": 2,
212
+ "bias": false,
213
+ "in_features": 8192,
214
+ "linear_cls": "QTIPLinearTCQ",
215
+ "linear_dtype": "float32",
216
+ "out_features": 2048,
217
+ "td_x": 16,
218
+ "td_y": 16,
219
+ "tlut_bits": 11
220
+ },
221
+ "module_type": "IncoherentLinear",
222
+ "out_features": 2048,
223
+ "rot_info": "skip_r",
224
+ "scale": 32.0
225
+ },
226
+ "model.layers.1.mlp.gate_proj": {
227
+ "bias": false,
228
+ "dtype": "float32",
229
+ "hadU": 2048,
230
+ "hadV": 8192,
231
+ "in_features": 2048,
232
+ "linear": {
233
+ "KV": 5,
234
+ "L": 16,
235
+ "V": 2,
236
+ "bias": false,
237
+ "in_features": 2048,
238
+ "linear_cls": "QTIPLinearTCQ",
239
+ "linear_dtype": "float32",
240
+ "out_features": 8192,
241
+ "td_x": 16,
242
+ "td_y": 16,
243
+ "tlut_bits": 9
244
+ },
245
+ "module_type": "IncoherentLinear",
246
+ "out_features": 8192,
247
+ "rot_info": "skip_r",
248
+ "scale": 32.0
249
+ },
250
+ "model.layers.1.mlp.up_proj": {
251
+ "bias": false,
252
+ "dtype": "float32",
253
+ "hadU": 2048,
254
+ "hadV": 8192,
255
+ "in_features": 2048,
256
+ "linear": {
257
+ "KV": 5,
258
+ "L": 16,
259
+ "V": 2,
260
+ "bias": false,
261
+ "in_features": 2048,
262
+ "linear_cls": "QTIPLinearTCQ",
263
+ "linear_dtype": "float32",
264
+ "out_features": 8192,
265
+ "td_x": 16,
266
+ "td_y": 16,
267
+ "tlut_bits": 9
268
+ },
269
+ "module_type": "IncoherentLinear",
270
+ "out_features": 8192,
271
+ "rot_info": "skip_r",
272
+ "scale": 32.0
273
+ },
274
+ "model.layers.1.self_attn.k_proj": {
275
+ "bias": false,
276
+ "dtype": "float32",
277
+ "hadU": 2048,
278
+ "hadV": 512,
279
+ "in_features": 2048,
280
+ "linear": {
281
+ "KV": 9,
282
+ "L": 16,
283
+ "V": 2,
284
+ "bias": false,
285
+ "in_features": 2048,
286
+ "linear_cls": "QTIPLinearTCQ",
287
+ "linear_dtype": "float32",
288
+ "out_features": 512,
289
+ "td_x": 16,
290
+ "td_y": 16,
291
+ "tlut_bits": 10
292
+ },
293
+ "module_type": "IncoherentLinear",
294
+ "out_features": 512,
295
+ "rot_info": "skip_r",
296
+ "scale": 32.0
297
+ },
298
+ "model.layers.1.self_attn.o_proj": {
299
+ "bias": false,
300
+ "dtype": "float32",
301
+ "hadU": 2048,
302
+ "hadV": 2048,
303
+ "in_features": 2048,
304
+ "linear": {
305
+ "KV": 7,
306
+ "L": 16,
307
+ "V": 2,
308
+ "bias": false,
309
+ "in_features": 2048,
310
+ "linear_cls": "QTIPLinearTCQ",
311
+ "linear_dtype": "float32",
312
+ "out_features": 2048,
313
+ "td_x": 16,
314
+ "td_y": 16,
315
+ "tlut_bits": 9
316
+ },
317
+ "module_type": "IncoherentLinear",
318
+ "out_features": 2048,
319
+ "rot_info": "skip_r",
320
+ "scale": 32.0
321
+ },
322
+ "model.layers.1.self_attn.q_proj": {
323
+ "bias": false,
324
+ "dtype": "float32",
325
+ "hadU": 2048,
326
+ "hadV": 2048,
327
+ "in_features": 2048,
328
+ "linear": {
329
+ "KV": 7,
330
+ "L": 16,
331
+ "V": 2,
332
+ "bias": false,
333
+ "in_features": 2048,
334
+ "linear_cls": "QTIPLinearTCQ",
335
+ "linear_dtype": "float32",
336
+ "out_features": 2048,
337
+ "td_x": 16,
338
+ "td_y": 16,
339
+ "tlut_bits": 9
340
+ },
341
+ "module_type": "IncoherentLinear",
342
+ "out_features": 2048,
343
+ "rot_info": "skip_r",
344
+ "scale": 32.0
345
+ },
346
+ "model.layers.1.self_attn.v_proj": {
347
+ "bias": false,
348
+ "dtype": "float32",
349
+ "hadU": 2048,
350
+ "hadV": 512,
351
+ "in_features": 2048,
352
+ "linear": {
353
+ "KV": [
354
+ 9,
355
+ 10
356
+ ],
357
+ "L": 16,
358
+ "V": 2,
359
+ "bias": false,
360
+ "in_features": 2048,
361
+ "in_part": [
362
+ 1024,
363
+ 1024
364
+ ],
365
+ "linear_cls": "CombtLinearTCQ",
366
+ "linear_dtype": "float32",
367
+ "out_features": 512,
368
+ "td_x": 16,
369
+ "td_y": 16,
370
+ "tlut_bits": 11
371
+ },
372
+ "module_type": "IncoherentLinear",
373
+ "out_features": 512,
374
+ "rot_info": "skip_r",
375
+ "scale": 32.0
376
+ },
377
+ "model.layers.10.mlp.down_proj": {
378
+ "bias": false,
379
+ "dtype": "float32",
380
+ "hadU": 8192,
381
+ "hadV": 2048,
382
+ "in_features": 8192,
383
+ "linear": {
384
+ "KV": 7,
385
+ "L": 16,
386
+ "V": 2,
387
+ "bias": false,
388
+ "in_features": 8192,
389
+ "linear_cls": "QTIPLinearTCQ",
390
+ "linear_dtype": "float32",
391
+ "out_features": 2048,
392
+ "td_x": 16,
393
+ "td_y": 16,
394
+ "tlut_bits": 9
395
+ },
396
+ "module_type": "IncoherentLinear",
397
+ "out_features": 2048,
398
+ "rot_info": "skip_r",
399
+ "scale": 32.0
400
+ },
401
+ "model.layers.10.mlp.gate_proj": {
402
+ "bias": false,
403
+ "dtype": "float32",
404
+ "hadU": 2048,
405
+ "hadV": 8192,
406
+ "in_features": 2048,
407
+ "linear": {
408
+ "KV": 6,
409
+ "L": 16,
410
+ "V": 2,
411
+ "bias": false,
412
+ "in_features": 2048,
413
+ "linear_cls": "QTIPLinearTCQ",
414
+ "linear_dtype": "float32",
415
+ "out_features": 8192,
416
+ "td_x": 16,
417
+ "td_y": 16,
418
+ "tlut_bits": 9
419
+ },
420
+ "module_type": "IncoherentLinear",
421
+ "out_features": 8192,
422
+ "rot_info": "skip_r",
423
+ "scale": 32.0
424
+ },
425
+ "model.layers.10.mlp.up_proj": {
426
+ "bias": false,
427
+ "dtype": "float32",
428
+ "hadU": 2048,
429
+ "hadV": 8192,
430
+ "in_features": 2048,
431
+ "linear": {
432
+ "KV": 6,
433
+ "L": 16,
434
+ "V": 2,
435
+ "bias": false,
436
+ "in_features": 2048,
437
+ "linear_cls": "QTIPLinearTCQ",
438
+ "linear_dtype": "float32",
439
+ "out_features": 8192,
440
+ "td_x": 16,
441
+ "td_y": 16,
442
+ "tlut_bits": 9
443
+ },
444
+ "module_type": "IncoherentLinear",
445
+ "out_features": 8192,
446
+ "rot_info": "skip_r",
447
+ "scale": 32.0
448
+ },
449
+ "model.layers.10.self_attn.k_proj": {
450
+ "bias": false,
451
+ "dtype": "float32",
452
+ "hadU": 2048,
453
+ "hadV": 512,
454
+ "in_features": 2048,
455
+ "linear": {
456
+ "KV": 9,
457
+ "L": 16,
458
+ "V": 2,
459
+ "bias": false,
460
+ "in_features": 2048,
461
+ "linear_cls": "QTIPLinearTCQ",
462
+ "linear_dtype": "float32",
463
+ "out_features": 512,
464
+ "td_x": 16,
465
+ "td_y": 16,
466
+ "tlut_bits": 10
467
+ },
468
+ "module_type": "IncoherentLinear",
469
+ "out_features": 512,
470
+ "rot_info": "skip_r",
471
+ "scale": 32.0
472
+ },
473
+ "model.layers.10.self_attn.o_proj": {
474
+ "bias": false,
475
+ "dtype": "float32",
476
+ "hadU": 2048,
477
+ "hadV": 2048,
478
+ "in_features": 2048,
479
+ "linear": {
480
+ "KV": 8,
481
+ "L": 16,
482
+ "V": 2,
483
+ "bias": false,
484
+ "in_features": 2048,
485
+ "linear_cls": "QTIPLinearTCQ",
486
+ "linear_dtype": "float32",
487
+ "out_features": 2048,
488
+ "td_x": 16,
489
+ "td_y": 16,
490
+ "tlut_bits": 9
491
+ },
492
+ "module_type": "IncoherentLinear",
493
+ "out_features": 2048,
494
+ "rot_info": "skip_r",
495
+ "scale": 32.0
496
+ },
497
+ "model.layers.10.self_attn.q_proj": {
498
+ "bias": false,
499
+ "dtype": "float32",
500
+ "hadU": 2048,
501
+ "hadV": 2048,
502
+ "in_features": 2048,
503
+ "linear": {
504
+ "KV": 8,
505
+ "L": 16,
506
+ "V": 2,
507
+ "bias": false,
508
+ "in_features": 2048,
509
+ "linear_cls": "QTIPLinearTCQ",
510
+ "linear_dtype": "float32",
511
+ "out_features": 2048,
512
+ "td_x": 16,
513
+ "td_y": 16,
514
+ "tlut_bits": 9
515
+ },
516
+ "module_type": "IncoherentLinear",
517
+ "out_features": 2048,
518
+ "rot_info": "skip_r",
519
+ "scale": 32.0
520
+ },
521
+ "model.layers.10.self_attn.v_proj": {
522
+ "bias": false,
523
+ "dtype": "float32",
524
+ "hadU": 2048,
525
+ "hadV": 512,
526
+ "in_features": 2048,
527
+ "linear": {
528
+ "KV": [
529
+ 9,
530
+ 10
531
+ ],
532
+ "L": 16,
533
+ "V": 2,
534
+ "bias": false,
535
+ "in_features": 2048,
536
+ "in_part": [
537
+ 1024,
538
+ 1024
539
+ ],
540
+ "linear_cls": "CombtLinearTCQ",
541
+ "linear_dtype": "float32",
542
+ "out_features": 512,
543
+ "td_x": 16,
544
+ "td_y": 16,
545
+ "tlut_bits": 11
546
+ },
547
+ "module_type": "IncoherentLinear",
548
+ "out_features": 512,
549
+ "rot_info": "skip_r",
550
+ "scale": 32.0
551
+ },
552
+ "model.layers.11.mlp.down_proj": {
553
+ "bias": false,
554
+ "dtype": "float32",
555
+ "hadU": 8192,
556
+ "hadV": 2048,
557
+ "in_features": 8192,
558
+ "linear": {
559
+ "KV": 7,
560
+ "L": 16,
561
+ "V": 2,
562
+ "bias": false,
563
+ "in_features": 8192,
564
+ "linear_cls": "QTIPLinearTCQ",
565
+ "linear_dtype": "float32",
566
+ "out_features": 2048,
567
+ "td_x": 16,
568
+ "td_y": 16,
569
+ "tlut_bits": 9
570
+ },
571
+ "module_type": "IncoherentLinear",
572
+ "out_features": 2048,
573
+ "rot_info": "skip_r",
574
+ "scale": 32.0
575
+ },
576
+ "model.layers.11.mlp.gate_proj": {
577
+ "bias": false,
578
+ "dtype": "float32",
579
+ "hadU": 2048,
580
+ "hadV": 8192,
581
+ "in_features": 2048,
582
+ "linear": {
583
+ "KV": 6,
584
+ "L": 16,
585
+ "V": 2,
586
+ "bias": false,
587
+ "in_features": 2048,
588
+ "linear_cls": "QTIPLinearTCQ",
589
+ "linear_dtype": "float32",
590
+ "out_features": 8192,
591
+ "td_x": 16,
592
+ "td_y": 16,
593
+ "tlut_bits": 9
594
+ },
595
+ "module_type": "IncoherentLinear",
596
+ "out_features": 8192,
597
+ "rot_info": "skip_r",
598
+ "scale": 32.0
599
+ },
600
+ "model.layers.11.mlp.up_proj": {
601
+ "bias": false,
602
+ "dtype": "float32",
603
+ "hadU": 2048,
604
+ "hadV": 8192,
605
+ "in_features": 2048,
606
+ "linear": {
607
+ "KV": 6,
608
+ "L": 16,
609
+ "V": 2,
610
+ "bias": false,
611
+ "in_features": 2048,
612
+ "linear_cls": "QTIPLinearTCQ",
613
+ "linear_dtype": "float32",
614
+ "out_features": 8192,
615
+ "td_x": 16,
616
+ "td_y": 16,
617
+ "tlut_bits": 9
618
+ },
619
+ "module_type": "IncoherentLinear",
620
+ "out_features": 8192,
621
+ "rot_info": "skip_r",
622
+ "scale": 32.0
623
+ },
624
+ "model.layers.11.self_attn.k_proj": {
625
+ "bias": false,
626
+ "dtype": "float32",
627
+ "hadU": 2048,
628
+ "hadV": 512,
629
+ "in_features": 2048,
630
+ "linear": {
631
+ "KV": [
632
+ 8,
633
+ 9
634
+ ],
635
+ "L": 16,
636
+ "V": 2,
637
+ "bias": false,
638
+ "in_features": 2048,
639
+ "in_part": [
640
+ 1024,
641
+ 1024
642
+ ],
643
+ "linear_cls": "CombtLinearTCQ",
644
+ "linear_dtype": "float32",
645
+ "out_features": 512,
646
+ "td_x": 16,
647
+ "td_y": 16,
648
+ "tlut_bits": 10
649
+ },
650
+ "module_type": "IncoherentLinear",
651
+ "out_features": 512,
652
+ "rot_info": "skip_r",
653
+ "scale": 32.0
654
+ },
655
+ "model.layers.11.self_attn.o_proj": {
656
+ "bias": false,
657
+ "dtype": "float32",
658
+ "hadU": 2048,
659
+ "hadV": 2048,
660
+ "in_features": 2048,
661
+ "linear": {
662
+ "KV": 7,
663
+ "L": 16,
664
+ "V": 2,
665
+ "bias": false,
666
+ "in_features": 2048,
667
+ "linear_cls": "QTIPLinearTCQ",
668
+ "linear_dtype": "float32",
669
+ "out_features": 2048,
670
+ "td_x": 16,
671
+ "td_y": 16,
672
+ "tlut_bits": 9
673
+ },
674
+ "module_type": "IncoherentLinear",
675
+ "out_features": 2048,
676
+ "rot_info": "skip_r",
677
+ "scale": 32.0
678
+ },
679
+ "model.layers.11.self_attn.q_proj": {
680
+ "bias": false,
681
+ "dtype": "float32",
682
+ "hadU": 2048,
683
+ "hadV": 2048,
684
+ "in_features": 2048,
685
+ "linear": {
686
+ "KV": 7,
687
+ "L": 16,
688
+ "V": 2,
689
+ "bias": false,
690
+ "in_features": 2048,
691
+ "linear_cls": "QTIPLinearTCQ",
692
+ "linear_dtype": "float32",
693
+ "out_features": 2048,
694
+ "td_x": 16,
695
+ "td_y": 16,
696
+ "tlut_bits": 9
697
+ },
698
+ "module_type": "IncoherentLinear",
699
+ "out_features": 2048,
700
+ "rot_info": "skip_r",
701
+ "scale": 32.0
702
+ },
703
+ "model.layers.11.self_attn.v_proj": {
704
+ "bias": false,
705
+ "dtype": "float32",
706
+ "hadU": 2048,
707
+ "hadV": 512,
708
+ "in_features": 2048,
709
+ "linear": {
710
+ "KV": 9,
711
+ "L": 16,
712
+ "V": 2,
713
+ "bias": false,
714
+ "in_features": 2048,
715
+ "linear_cls": "QTIPLinearTCQ",
716
+ "linear_dtype": "float32",
717
+ "out_features": 512,
718
+ "td_x": 16,
719
+ "td_y": 16,
720
+ "tlut_bits": 10
721
+ },
722
+ "module_type": "IncoherentLinear",
723
+ "out_features": 512,
724
+ "rot_info": "skip_r",
725
+ "scale": 32.0
726
+ },
727
+ "model.layers.12.mlp.down_proj": {
728
+ "bias": false,
729
+ "dtype": "float32",
730
+ "hadU": 8192,
731
+ "hadV": 2048,
732
+ "in_features": 8192,
733
+ "linear": {
734
+ "KV": 6,
735
+ "L": 16,
736
+ "V": 2,
737
+ "bias": false,
738
+ "in_features": 8192,
739
+ "linear_cls": "QTIPLinearTCQ",
740
+ "linear_dtype": "float32",
741
+ "out_features": 2048,
742
+ "td_x": 16,
743
+ "td_y": 16,
744
+ "tlut_bits": 9
745
+ },
746
+ "module_type": "IncoherentLinear",
747
+ "out_features": 2048,
748
+ "rot_info": "skip_r",
749
+ "scale": 32.0
750
+ },
751
+ "model.layers.12.mlp.gate_proj": {
752
+ "bias": false,
753
+ "dtype": "float32",
754
+ "hadU": 2048,
755
+ "hadV": 8192,
756
+ "in_features": 2048,
757
+ "linear": {
758
+ "KV": 6,
759
+ "L": 16,
760
+ "V": 2,
761
+ "bias": false,
762
+ "in_features": 2048,
763
+ "linear_cls": "QTIPLinearTCQ",
764
+ "linear_dtype": "float32",
765
+ "out_features": 8192,
766
+ "td_x": 16,
767
+ "td_y": 16,
768
+ "tlut_bits": 9
769
+ },
770
+ "module_type": "IncoherentLinear",
771
+ "out_features": 8192,
772
+ "rot_info": "skip_r",
773
+ "scale": 32.0
774
+ },
775
+ "model.layers.12.mlp.up_proj": {
776
+ "bias": false,
777
+ "dtype": "float32",
778
+ "hadU": 2048,
779
+ "hadV": 8192,
780
+ "in_features": 2048,
781
+ "linear": {
782
+ "KV": 6,
783
+ "L": 16,
784
+ "V": 2,
785
+ "bias": false,
786
+ "in_features": 2048,
787
+ "linear_cls": "QTIPLinearTCQ",
788
+ "linear_dtype": "float32",
789
+ "out_features": 8192,
790
+ "td_x": 16,
791
+ "td_y": 16,
792
+ "tlut_bits": 9
793
+ },
794
+ "module_type": "IncoherentLinear",
795
+ "out_features": 8192,
796
+ "rot_info": "skip_r",
797
+ "scale": 32.0
798
+ },
799
+ "model.layers.12.self_attn.k_proj": {
800
+ "bias": false,
801
+ "dtype": "float32",
802
+ "hadU": 2048,
803
+ "hadV": 512,
804
+ "in_features": 2048,
805
+ "linear": {
806
+ "KV": 9,
807
+ "L": 16,
808
+ "V": 2,
809
+ "bias": false,
810
+ "in_features": 2048,
811
+ "linear_cls": "QTIPLinearTCQ",
812
+ "linear_dtype": "float32",
813
+ "out_features": 512,
814
+ "td_x": 16,
815
+ "td_y": 16,
816
+ "tlut_bits": 10
817
+ },
818
+ "module_type": "IncoherentLinear",
819
+ "out_features": 512,
820
+ "rot_info": "skip_r",
821
+ "scale": 32.0
822
+ },
823
+ "model.layers.12.self_attn.o_proj": {
824
+ "bias": false,
825
+ "dtype": "float32",
826
+ "hadU": 2048,
827
+ "hadV": 2048,
828
+ "in_features": 2048,
829
+ "linear": {
830
+ "KV": 7,
831
+ "L": 16,
832
+ "V": 2,
833
+ "bias": false,
834
+ "in_features": 2048,
835
+ "linear_cls": "QTIPLinearTCQ",
836
+ "linear_dtype": "float32",
837
+ "out_features": 2048,
838
+ "td_x": 16,
839
+ "td_y": 16,
840
+ "tlut_bits": 9
841
+ },
842
+ "module_type": "IncoherentLinear",
843
+ "out_features": 2048,
844
+ "rot_info": "skip_r",
845
+ "scale": 32.0
846
+ },
847
+ "model.layers.12.self_attn.q_proj": {
848
+ "bias": false,
849
+ "dtype": "float32",
850
+ "hadU": 2048,
851
+ "hadV": 2048,
852
+ "in_features": 2048,
853
+ "linear": {
854
+ "KV": 7,
855
+ "L": 16,
856
+ "V": 2,
857
+ "bias": false,
858
+ "in_features": 2048,
859
+ "linear_cls": "QTIPLinearTCQ",
860
+ "linear_dtype": "float32",
861
+ "out_features": 2048,
862
+ "td_x": 16,
863
+ "td_y": 16,
864
+ "tlut_bits": 9
865
+ },
866
+ "module_type": "IncoherentLinear",
867
+ "out_features": 2048,
868
+ "rot_info": "skip_r",
869
+ "scale": 32.0
870
+ },
871
+ "model.layers.12.self_attn.v_proj": {
872
+ "bias": false,
873
+ "dtype": "float32",
874
+ "hadU": 2048,
875
+ "hadV": 512,
876
+ "in_features": 2048,
877
+ "linear": {
878
+ "KV": 9,
879
+ "L": 16,
880
+ "V": 2,
881
+ "bias": false,
882
+ "in_features": 2048,
883
+ "linear_cls": "QTIPLinearTCQ",
884
+ "linear_dtype": "float32",
885
+ "out_features": 512,
886
+ "td_x": 16,
887
+ "td_y": 16,
888
+ "tlut_bits": 10
889
+ },
890
+ "module_type": "IncoherentLinear",
891
+ "out_features": 512,
892
+ "rot_info": "skip_r",
893
+ "scale": 32.0
894
+ },
895
+ "model.layers.13.mlp.down_proj": {
896
+ "bias": false,
897
+ "dtype": "float32",
898
+ "hadU": 8192,
899
+ "hadV": 2048,
900
+ "in_features": 8192,
901
+ "linear": {
902
+ "KV": 6,
903
+ "L": 16,
904
+ "V": 2,
905
+ "bias": false,
906
+ "in_features": 8192,
907
+ "linear_cls": "QTIPLinearTCQ",
908
+ "linear_dtype": "float32",
909
+ "out_features": 2048,
910
+ "td_x": 16,
911
+ "td_y": 16,
912
+ "tlut_bits": 9
913
+ },
914
+ "module_type": "IncoherentLinear",
915
+ "out_features": 2048,
916
+ "rot_info": "skip_r",
917
+ "scale": 32.0
918
+ },
919
+ "model.layers.13.mlp.gate_proj": {
920
+ "bias": false,
921
+ "dtype": "float32",
922
+ "hadU": 2048,
923
+ "hadV": 8192,
924
+ "in_features": 2048,
925
+ "linear": {
926
+ "KV": 6,
927
+ "L": 16,
928
+ "V": 2,
929
+ "bias": false,
930
+ "in_features": 2048,
931
+ "linear_cls": "QTIPLinearTCQ",
932
+ "linear_dtype": "float32",
933
+ "out_features": 8192,
934
+ "td_x": 16,
935
+ "td_y": 16,
936
+ "tlut_bits": 9
937
+ },
938
+ "module_type": "IncoherentLinear",
939
+ "out_features": 8192,
940
+ "rot_info": "skip_r",
941
+ "scale": 32.0
942
+ },
943
+ "model.layers.13.mlp.up_proj": {
944
+ "bias": false,
945
+ "dtype": "float32",
946
+ "hadU": 2048,
947
+ "hadV": 8192,
948
+ "in_features": 2048,
949
+ "linear": {
950
+ "KV": 6,
951
+ "L": 16,
952
+ "V": 2,
953
+ "bias": false,
954
+ "in_features": 2048,
955
+ "linear_cls": "QTIPLinearTCQ",
956
+ "linear_dtype": "float32",
957
+ "out_features": 8192,
958
+ "td_x": 16,
959
+ "td_y": 16,
960
+ "tlut_bits": 9
961
+ },
962
+ "module_type": "IncoherentLinear",
963
+ "out_features": 8192,
964
+ "rot_info": "skip_r",
965
+ "scale": 32.0
966
+ },
967
+ "model.layers.13.self_attn.k_proj": {
968
+ "bias": false,
969
+ "dtype": "float32",
970
+ "hadU": 2048,
971
+ "hadV": 512,
972
+ "in_features": 2048,
973
+ "linear": {
974
+ "KV": 8,
975
+ "L": 16,
976
+ "V": 2,
977
+ "bias": false,
978
+ "in_features": 2048,
979
+ "linear_cls": "QTIPLinearTCQ",
980
+ "linear_dtype": "float32",
981
+ "out_features": 512,
982
+ "td_x": 16,
983
+ "td_y": 16,
984
+ "tlut_bits": 9
985
+ },
986
+ "module_type": "IncoherentLinear",
987
+ "out_features": 512,
988
+ "rot_info": "skip_r",
989
+ "scale": 32.0
990
+ },
991
+ "model.layers.13.self_attn.o_proj": {
992
+ "bias": false,
993
+ "dtype": "float32",
994
+ "hadU": 2048,
995
+ "hadV": 2048,
996
+ "in_features": 2048,
997
+ "linear": {
998
+ "KV": 7,
999
+ "L": 16,
1000
+ "V": 2,
1001
+ "bias": false,
1002
+ "in_features": 2048,
1003
+ "linear_cls": "QTIPLinearTCQ",
1004
+ "linear_dtype": "float32",
1005
+ "out_features": 2048,
1006
+ "td_x": 16,
1007
+ "td_y": 16,
1008
+ "tlut_bits": 9
1009
+ },
1010
+ "module_type": "IncoherentLinear",
1011
+ "out_features": 2048,
1012
+ "rot_info": "skip_r",
1013
+ "scale": 32.0
1014
+ },
1015
+ "model.layers.13.self_attn.q_proj": {
1016
+ "bias": false,
1017
+ "dtype": "float32",
1018
+ "hadU": 2048,
1019
+ "hadV": 2048,
1020
+ "in_features": 2048,
1021
+ "linear": {
1022
+ "KV": 8,
1023
+ "L": 16,
1024
+ "V": 2,
1025
+ "bias": false,
1026
+ "in_features": 2048,
1027
+ "linear_cls": "QTIPLinearTCQ",
1028
+ "linear_dtype": "float32",
1029
+ "out_features": 2048,
1030
+ "td_x": 16,
1031
+ "td_y": 16,
1032
+ "tlut_bits": 9
1033
+ },
1034
+ "module_type": "IncoherentLinear",
1035
+ "out_features": 2048,
1036
+ "rot_info": "skip_r",
1037
+ "scale": 32.0
1038
+ },
1039
+ "model.layers.13.self_attn.v_proj": {
1040
+ "bias": false,
1041
+ "dtype": "float32",
1042
+ "hadU": 2048,
1043
+ "hadV": 512,
1044
+ "in_features": 2048,
1045
+ "linear": {
1046
+ "KV": 9,
1047
+ "L": 16,
1048
+ "V": 2,
1049
+ "bias": false,
1050
+ "in_features": 2048,
1051
+ "linear_cls": "QTIPLinearTCQ",
1052
+ "linear_dtype": "float32",
1053
+ "out_features": 512,
1054
+ "td_x": 16,
1055
+ "td_y": 16,
1056
+ "tlut_bits": 10
1057
+ },
1058
+ "module_type": "IncoherentLinear",
1059
+ "out_features": 512,
1060
+ "rot_info": "skip_r",
1061
+ "scale": 32.0
1062
+ },
1063
+ "model.layers.14.mlp.down_proj": {
1064
+ "bias": false,
1065
+ "dtype": "float32",
1066
+ "hadU": 8192,
1067
+ "hadV": 2048,
1068
+ "in_features": 8192,
1069
+ "linear": {
1070
+ "KV": 7,
1071
+ "L": 16,
1072
+ "V": 2,
1073
+ "bias": false,
1074
+ "in_features": 8192,
1075
+ "linear_cls": "QTIPLinearTCQ",
1076
+ "linear_dtype": "float32",
1077
+ "out_features": 2048,
1078
+ "td_x": 16,
1079
+ "td_y": 16,
1080
+ "tlut_bits": 9
1081
+ },
1082
+ "module_type": "IncoherentLinear",
1083
+ "out_features": 2048,
1084
+ "rot_info": "skip_r",
1085
+ "scale": 32.0
1086
+ },
1087
+ "model.layers.14.mlp.gate_proj": {
1088
+ "bias": false,
1089
+ "dtype": "float32",
1090
+ "hadU": 2048,
1091
+ "hadV": 8192,
1092
+ "in_features": 2048,
1093
+ "linear": {
1094
+ "KV": 6,
1095
+ "L": 16,
1096
+ "V": 2,
1097
+ "bias": false,
1098
+ "in_features": 2048,
1099
+ "linear_cls": "QTIPLinearTCQ",
1100
+ "linear_dtype": "float32",
1101
+ "out_features": 8192,
1102
+ "td_x": 16,
1103
+ "td_y": 16,
1104
+ "tlut_bits": 9
1105
+ },
1106
+ "module_type": "IncoherentLinear",
1107
+ "out_features": 8192,
1108
+ "rot_info": "skip_r",
1109
+ "scale": 32.0
1110
+ },
1111
+ "model.layers.14.mlp.up_proj": {
1112
+ "bias": false,
1113
+ "dtype": "float32",
1114
+ "hadU": 2048,
1115
+ "hadV": 8192,
1116
+ "in_features": 2048,
1117
+ "linear": {
1118
+ "KV": 6,
1119
+ "L": 16,
1120
+ "V": 2,
1121
+ "bias": false,
1122
+ "in_features": 2048,
1123
+ "linear_cls": "QTIPLinearTCQ",
1124
+ "linear_dtype": "float32",
1125
+ "out_features": 8192,
1126
+ "td_x": 16,
1127
+ "td_y": 16,
1128
+ "tlut_bits": 9
1129
+ },
1130
+ "module_type": "IncoherentLinear",
1131
+ "out_features": 8192,
1132
+ "rot_info": "skip_r",
1133
+ "scale": 32.0
1134
+ },
1135
+ "model.layers.14.self_attn.k_proj": {
1136
+ "bias": false,
1137
+ "dtype": "float32",
1138
+ "hadU": 2048,
1139
+ "hadV": 512,
1140
+ "in_features": 2048,
1141
+ "linear": {
1142
+ "KV": [
1143
+ 8,
1144
+ 9
1145
+ ],
1146
+ "L": 16,
1147
+ "V": 2,
1148
+ "bias": false,
1149
+ "in_features": 2048,
1150
+ "in_part": [
1151
+ 1024,
1152
+ 1024
1153
+ ],
1154
+ "linear_cls": "CombtLinearTCQ",
1155
+ "linear_dtype": "float32",
1156
+ "out_features": 512,
1157
+ "td_x": 16,
1158
+ "td_y": 16,
1159
+ "tlut_bits": 10
1160
+ },
1161
+ "module_type": "IncoherentLinear",
1162
+ "out_features": 512,
1163
+ "rot_info": "skip_r",
1164
+ "scale": 32.0
1165
+ },
1166
+ "model.layers.14.self_attn.o_proj": {
1167
+ "bias": false,
1168
+ "dtype": "float32",
1169
+ "hadU": 2048,
1170
+ "hadV": 2048,
1171
+ "in_features": 2048,
1172
+ "linear": {
1173
+ "KV": 7,
1174
+ "L": 16,
1175
+ "V": 2,
1176
+ "bias": false,
1177
+ "in_features": 2048,
1178
+ "linear_cls": "QTIPLinearTCQ",
1179
+ "linear_dtype": "float32",
1180
+ "out_features": 2048,
1181
+ "td_x": 16,
1182
+ "td_y": 16,
1183
+ "tlut_bits": 9
1184
+ },
1185
+ "module_type": "IncoherentLinear",
1186
+ "out_features": 2048,
1187
+ "rot_info": "skip_r",
1188
+ "scale": 32.0
1189
+ },
1190
+ "model.layers.14.self_attn.q_proj": {
1191
+ "bias": false,
1192
+ "dtype": "float32",
1193
+ "hadU": 2048,
1194
+ "hadV": 2048,
1195
+ "in_features": 2048,
1196
+ "linear": {
1197
+ "KV": 6,
1198
+ "L": 16,
1199
+ "V": 2,
1200
+ "bias": false,
1201
+ "in_features": 2048,
1202
+ "linear_cls": "QTIPLinearTCQ",
1203
+ "linear_dtype": "float32",
1204
+ "out_features": 2048,
1205
+ "td_x": 16,
1206
+ "td_y": 16,
1207
+ "tlut_bits": 9
1208
+ },
1209
+ "module_type": "IncoherentLinear",
1210
+ "out_features": 2048,
1211
+ "rot_info": "skip_r",
1212
+ "scale": 32.0
1213
+ },
1214
+ "model.layers.14.self_attn.v_proj": {
1215
+ "bias": false,
1216
+ "dtype": "float32",
1217
+ "hadU": 2048,
1218
+ "hadV": 512,
1219
+ "in_features": 2048,
1220
+ "linear": {
1221
+ "KV": [
1222
+ 9,
1223
+ 10
1224
+ ],
1225
+ "L": 16,
1226
+ "V": 2,
1227
+ "bias": false,
1228
+ "in_features": 2048,
1229
+ "in_part": [
1230
+ 1024,
1231
+ 1024
1232
+ ],
1233
+ "linear_cls": "CombtLinearTCQ",
1234
+ "linear_dtype": "float32",
1235
+ "out_features": 512,
1236
+ "td_x": 16,
1237
+ "td_y": 16,
1238
+ "tlut_bits": 11
1239
+ },
1240
+ "module_type": "IncoherentLinear",
1241
+ "out_features": 512,
1242
+ "rot_info": "skip_r",
1243
+ "scale": 32.0
1244
+ },
1245
+ "model.layers.15.mlp.down_proj": {
1246
+ "bias": false,
1247
+ "dtype": "float32",
1248
+ "hadU": 8192,
1249
+ "hadV": 2048,
1250
+ "in_features": 8192,
1251
+ "linear": {
1252
+ "KV": [
1253
+ 8,
1254
+ 9
1255
+ ],
1256
+ "L": 16,
1257
+ "V": 2,
1258
+ "bias": false,
1259
+ "in_features": 8192,
1260
+ "in_part": [
1261
+ 4096,
1262
+ 4096
1263
+ ],
1264
+ "linear_cls": "CombtLinearTCQ",
1265
+ "linear_dtype": "float32",
1266
+ "out_features": 2048,
1267
+ "td_x": 16,
1268
+ "td_y": 16,
1269
+ "tlut_bits": 10
1270
+ },
1271
+ "module_type": "IncoherentLinear",
1272
+ "out_features": 2048,
1273
+ "rot_info": "skip_r",
1274
+ "scale": 32.0
1275
+ },
1276
+ "model.layers.15.mlp.gate_proj": {
1277
+ "bias": false,
1278
+ "dtype": "float32",
1279
+ "hadU": 2048,
1280
+ "hadV": 8192,
1281
+ "in_features": 2048,
1282
+ "linear": {
1283
+ "KV": 6,
1284
+ "L": 16,
1285
+ "V": 2,
1286
+ "bias": false,
1287
+ "in_features": 2048,
1288
+ "linear_cls": "QTIPLinearTCQ",
1289
+ "linear_dtype": "float32",
1290
+ "out_features": 8192,
1291
+ "td_x": 16,
1292
+ "td_y": 16,
1293
+ "tlut_bits": 9
1294
+ },
1295
+ "module_type": "IncoherentLinear",
1296
+ "out_features": 8192,
1297
+ "rot_info": "skip_r",
1298
+ "scale": 32.0
1299
+ },
1300
+ "model.layers.15.mlp.up_proj": {
1301
+ "bias": false,
1302
+ "dtype": "float32",
1303
+ "hadU": 2048,
1304
+ "hadV": 8192,
1305
+ "in_features": 2048,
1306
+ "linear": {
1307
+ "KV": 7,
1308
+ "L": 16,
1309
+ "V": 2,
1310
+ "bias": false,
1311
+ "in_features": 2048,
1312
+ "linear_cls": "QTIPLinearTCQ",
1313
+ "linear_dtype": "float32",
1314
+ "out_features": 8192,
1315
+ "td_x": 16,
1316
+ "td_y": 16,
1317
+ "tlut_bits": 9
1318
+ },
1319
+ "module_type": "IncoherentLinear",
1320
+ "out_features": 8192,
1321
+ "rot_info": "skip_r",
1322
+ "scale": 32.0
1323
+ },
1324
+ "model.layers.15.self_attn.k_proj": {
1325
+ "bias": false,
1326
+ "dtype": "float32",
1327
+ "hadU": 2048,
1328
+ "hadV": 512,
1329
+ "in_features": 2048,
1330
+ "linear": {
1331
+ "KV": [
1332
+ 8,
1333
+ 9
1334
+ ],
1335
+ "L": 16,
1336
+ "V": 2,
1337
+ "bias": false,
1338
+ "in_features": 2048,
1339
+ "in_part": [
1340
+ 1024,
1341
+ 1024
1342
+ ],
1343
+ "linear_cls": "CombtLinearTCQ",
1344
+ "linear_dtype": "float32",
1345
+ "out_features": 512,
1346
+ "td_x": 16,
1347
+ "td_y": 16,
1348
+ "tlut_bits": 10
1349
+ },
1350
+ "module_type": "IncoherentLinear",
1351
+ "out_features": 512,
1352
+ "rot_info": "skip_r",
1353
+ "scale": 32.0
1354
+ },
1355
+ "model.layers.15.self_attn.o_proj": {
1356
+ "bias": false,
1357
+ "dtype": "float32",
1358
+ "hadU": 2048,
1359
+ "hadV": 2048,
1360
+ "in_features": 2048,
1361
+ "linear": {
1362
+ "KV": [
1363
+ 8,
1364
+ 9
1365
+ ],
1366
+ "L": 16,
1367
+ "V": 2,
1368
+ "bias": false,
1369
+ "in_features": 2048,
1370
+ "in_part": [
1371
+ 1024,
1372
+ 1024
1373
+ ],
1374
+ "linear_cls": "CombtLinearTCQ",
1375
+ "linear_dtype": "float32",
1376
+ "out_features": 2048,
1377
+ "td_x": 16,
1378
+ "td_y": 16,
1379
+ "tlut_bits": 10
1380
+ },
1381
+ "module_type": "IncoherentLinear",
1382
+ "out_features": 2048,
1383
+ "rot_info": "skip_r",
1384
+ "scale": 32.0
1385
+ },
1386
+ "model.layers.15.self_attn.q_proj": {
1387
+ "bias": false,
1388
+ "dtype": "float32",
1389
+ "hadU": 2048,
1390
+ "hadV": 2048,
1391
+ "in_features": 2048,
1392
+ "linear": {
1393
+ "KV": 6,
1394
+ "L": 16,
1395
+ "V": 2,
1396
+ "bias": false,
1397
+ "in_features": 2048,
1398
+ "linear_cls": "QTIPLinearTCQ",
1399
+ "linear_dtype": "float32",
1400
+ "out_features": 2048,
1401
+ "td_x": 16,
1402
+ "td_y": 16,
1403
+ "tlut_bits": 9
1404
+ },
1405
+ "module_type": "IncoherentLinear",
1406
+ "out_features": 2048,
1407
+ "rot_info": "skip_r",
1408
+ "scale": 32.0
1409
+ },
1410
+ "model.layers.15.self_attn.v_proj": {
1411
+ "bias": false,
1412
+ "dtype": "float32",
1413
+ "hadU": 2048,
1414
+ "hadV": 512,
1415
+ "in_features": 2048,
1416
+ "linear": {
1417
+ "KV": 9,
1418
+ "L": 16,
1419
+ "V": 2,
1420
+ "bias": false,
1421
+ "in_features": 2048,
1422
+ "linear_cls": "QTIPLinearTCQ",
1423
+ "linear_dtype": "float32",
1424
+ "out_features": 512,
1425
+ "td_x": 16,
1426
+ "td_y": 16,
1427
+ "tlut_bits": 10
1428
+ },
1429
+ "module_type": "IncoherentLinear",
1430
+ "out_features": 512,
1431
+ "rot_info": "skip_r",
1432
+ "scale": 32.0
1433
+ },
1434
+ "model.layers.2.mlp.down_proj": {
1435
+ "bias": false,
1436
+ "dtype": "float32",
1437
+ "hadU": 8192,
1438
+ "hadV": 2048,
1439
+ "in_features": 8192,
1440
+ "linear": {
1441
+ "KV": 6,
1442
+ "L": 16,
1443
+ "V": 2,
1444
+ "bias": false,
1445
+ "in_features": 8192,
1446
+ "linear_cls": "QTIPLinearTCQ",
1447
+ "linear_dtype": "float32",
1448
+ "out_features": 2048,
1449
+ "td_x": 16,
1450
+ "td_y": 16,
1451
+ "tlut_bits": 9
1452
+ },
1453
+ "module_type": "IncoherentLinear",
1454
+ "out_features": 2048,
1455
+ "rot_info": "skip_r",
1456
+ "scale": 32.0
1457
+ },
1458
+ "model.layers.2.mlp.gate_proj": {
1459
+ "bias": false,
1460
+ "dtype": "float32",
1461
+ "hadU": 2048,
1462
+ "hadV": 8192,
1463
+ "in_features": 2048,
1464
+ "linear": {
1465
+ "KV": 5,
1466
+ "L": 16,
1467
+ "V": 2,
1468
+ "bias": false,
1469
+ "in_features": 2048,
1470
+ "linear_cls": "QTIPLinearTCQ",
1471
+ "linear_dtype": "float32",
1472
+ "out_features": 8192,
1473
+ "td_x": 16,
1474
+ "td_y": 16,
1475
+ "tlut_bits": 9
1476
+ },
1477
+ "module_type": "IncoherentLinear",
1478
+ "out_features": 8192,
1479
+ "rot_info": "skip_r",
1480
+ "scale": 32.0
1481
+ },
1482
+ "model.layers.2.mlp.up_proj": {
1483
+ "bias": false,
1484
+ "dtype": "float32",
1485
+ "hadU": 2048,
1486
+ "hadV": 8192,
1487
+ "in_features": 2048,
1488
+ "linear": {
1489
+ "KV": 6,
1490
+ "L": 16,
1491
+ "V": 2,
1492
+ "bias": false,
1493
+ "in_features": 2048,
1494
+ "linear_cls": "QTIPLinearTCQ",
1495
+ "linear_dtype": "float32",
1496
+ "out_features": 8192,
1497
+ "td_x": 16,
1498
+ "td_y": 16,
1499
+ "tlut_bits": 9
1500
+ },
1501
+ "module_type": "IncoherentLinear",
1502
+ "out_features": 8192,
1503
+ "rot_info": "skip_r",
1504
+ "scale": 32.0
1505
+ },
1506
+ "model.layers.2.self_attn.k_proj": {
1507
+ "bias": false,
1508
+ "dtype": "float32",
1509
+ "hadU": 2048,
1510
+ "hadV": 512,
1511
+ "in_features": 2048,
1512
+ "linear": {
1513
+ "KV": [
1514
+ 8,
1515
+ 9
1516
+ ],
1517
+ "L": 16,
1518
+ "V": 2,
1519
+ "bias": false,
1520
+ "in_features": 2048,
1521
+ "in_part": [
1522
+ 1024,
1523
+ 1024
1524
+ ],
1525
+ "linear_cls": "CombtLinearTCQ",
1526
+ "linear_dtype": "float32",
1527
+ "out_features": 512,
1528
+ "td_x": 16,
1529
+ "td_y": 16,
1530
+ "tlut_bits": 10
1531
+ },
1532
+ "module_type": "IncoherentLinear",
1533
+ "out_features": 512,
1534
+ "rot_info": "skip_r",
1535
+ "scale": 32.0
1536
+ },
1537
+ "model.layers.2.self_attn.o_proj": {
1538
+ "bias": false,
1539
+ "dtype": "float32",
1540
+ "hadU": 2048,
1541
+ "hadV": 2048,
1542
+ "in_features": 2048,
1543
+ "linear": {
1544
+ "KV": 7,
1545
+ "L": 16,
1546
+ "V": 2,
1547
+ "bias": false,
1548
+ "in_features": 2048,
1549
+ "linear_cls": "QTIPLinearTCQ",
1550
+ "linear_dtype": "float32",
1551
+ "out_features": 2048,
1552
+ "td_x": 16,
1553
+ "td_y": 16,
1554
+ "tlut_bits": 9
1555
+ },
1556
+ "module_type": "IncoherentLinear",
1557
+ "out_features": 2048,
1558
+ "rot_info": "skip_r",
1559
+ "scale": 32.0
1560
+ },
1561
+ "model.layers.2.self_attn.q_proj": {
1562
+ "bias": false,
1563
+ "dtype": "float32",
1564
+ "hadU": 2048,
1565
+ "hadV": 2048,
1566
+ "in_features": 2048,
1567
+ "linear": {
1568
+ "KV": 7,
1569
+ "L": 16,
1570
+ "V": 2,
1571
+ "bias": false,
1572
+ "in_features": 2048,
1573
+ "linear_cls": "QTIPLinearTCQ",
1574
+ "linear_dtype": "float32",
1575
+ "out_features": 2048,
1576
+ "td_x": 16,
1577
+ "td_y": 16,
1578
+ "tlut_bits": 9
1579
+ },
1580
+ "module_type": "IncoherentLinear",
1581
+ "out_features": 2048,
1582
+ "rot_info": "skip_r",
1583
+ "scale": 32.0
1584
+ },
1585
+ "model.layers.2.self_attn.v_proj": {
1586
+ "bias": false,
1587
+ "dtype": "float32",
1588
+ "hadU": 2048,
1589
+ "hadV": 512,
1590
+ "in_features": 2048,
1591
+ "linear": {
1592
+ "KV": [
1593
+ 9,
1594
+ 10
1595
+ ],
1596
+ "L": 16,
1597
+ "V": 2,
1598
+ "bias": false,
1599
+ "in_features": 2048,
1600
+ "in_part": [
1601
+ 1024,
1602
+ 1024
1603
+ ],
1604
+ "linear_cls": "CombtLinearTCQ",
1605
+ "linear_dtype": "float32",
1606
+ "out_features": 512,
1607
+ "td_x": 16,
1608
+ "td_y": 16,
1609
+ "tlut_bits": 11
1610
+ },
1611
+ "module_type": "IncoherentLinear",
1612
+ "out_features": 512,
1613
+ "rot_info": "skip_r",
1614
+ "scale": 32.0
1615
+ },
1616
+ "model.layers.3.mlp.down_proj": {
1617
+ "bias": false,
1618
+ "dtype": "float32",
1619
+ "hadU": 8192,
1620
+ "hadV": 2048,
1621
+ "in_features": 8192,
1622
+ "linear": {
1623
+ "KV": 6,
1624
+ "L": 16,
1625
+ "V": 2,
1626
+ "bias": false,
1627
+ "in_features": 8192,
1628
+ "linear_cls": "QTIPLinearTCQ",
1629
+ "linear_dtype": "float32",
1630
+ "out_features": 2048,
1631
+ "td_x": 16,
1632
+ "td_y": 16,
1633
+ "tlut_bits": 9
1634
+ },
1635
+ "module_type": "IncoherentLinear",
1636
+ "out_features": 2048,
1637
+ "rot_info": "skip_r",
1638
+ "scale": 32.0
1639
+ },
1640
+ "model.layers.3.mlp.gate_proj": {
1641
+ "bias": false,
1642
+ "dtype": "float32",
1643
+ "hadU": 2048,
1644
+ "hadV": 8192,
1645
+ "in_features": 2048,
1646
+ "linear": {
1647
+ "KV": 5,
1648
+ "L": 16,
1649
+ "V": 2,
1650
+ "bias": false,
1651
+ "in_features": 2048,
1652
+ "linear_cls": "QTIPLinearTCQ",
1653
+ "linear_dtype": "float32",
1654
+ "out_features": 8192,
1655
+ "td_x": 16,
1656
+ "td_y": 16,
1657
+ "tlut_bits": 9
1658
+ },
1659
+ "module_type": "IncoherentLinear",
1660
+ "out_features": 8192,
1661
+ "rot_info": "skip_r",
1662
+ "scale": 32.0
1663
+ },
1664
+ "model.layers.3.mlp.up_proj": {
1665
+ "bias": false,
1666
+ "dtype": "float32",
1667
+ "hadU": 2048,
1668
+ "hadV": 8192,
1669
+ "in_features": 2048,
1670
+ "linear": {
1671
+ "KV": 6,
1672
+ "L": 16,
1673
+ "V": 2,
1674
+ "bias": false,
1675
+ "in_features": 2048,
1676
+ "linear_cls": "QTIPLinearTCQ",
1677
+ "linear_dtype": "float32",
1678
+ "out_features": 8192,
1679
+ "td_x": 16,
1680
+ "td_y": 16,
1681
+ "tlut_bits": 9
1682
+ },
1683
+ "module_type": "IncoherentLinear",
1684
+ "out_features": 8192,
1685
+ "rot_info": "skip_r",
1686
+ "scale": 32.0
1687
+ },
1688
+ "model.layers.3.self_attn.k_proj": {
1689
+ "bias": false,
1690
+ "dtype": "float32",
1691
+ "hadU": 2048,
1692
+ "hadV": 512,
1693
+ "in_features": 2048,
1694
+ "linear": {
1695
+ "KV": [
1696
+ 8,
1697
+ 9
1698
+ ],
1699
+ "L": 16,
1700
+ "V": 2,
1701
+ "bias": false,
1702
+ "in_features": 2048,
1703
+ "in_part": [
1704
+ 1024,
1705
+ 1024
1706
+ ],
1707
+ "linear_cls": "CombtLinearTCQ",
1708
+ "linear_dtype": "float32",
1709
+ "out_features": 512,
1710
+ "td_x": 16,
1711
+ "td_y": 16,
1712
+ "tlut_bits": 10
1713
+ },
1714
+ "module_type": "IncoherentLinear",
1715
+ "out_features": 512,
1716
+ "rot_info": "skip_r",
1717
+ "scale": 32.0
1718
+ },
1719
+ "model.layers.3.self_attn.o_proj": {
1720
+ "bias": false,
1721
+ "dtype": "float32",
1722
+ "hadU": 2048,
1723
+ "hadV": 2048,
1724
+ "in_features": 2048,
1725
+ "linear": {
1726
+ "KV": 7,
1727
+ "L": 16,
1728
+ "V": 2,
1729
+ "bias": false,
1730
+ "in_features": 2048,
1731
+ "linear_cls": "QTIPLinearTCQ",
1732
+ "linear_dtype": "float32",
1733
+ "out_features": 2048,
1734
+ "td_x": 16,
1735
+ "td_y": 16,
1736
+ "tlut_bits": 9
1737
+ },
1738
+ "module_type": "IncoherentLinear",
1739
+ "out_features": 2048,
1740
+ "rot_info": "skip_r",
1741
+ "scale": 32.0
1742
+ },
1743
+ "model.layers.3.self_attn.q_proj": {
1744
+ "bias": false,
1745
+ "dtype": "float32",
1746
+ "hadU": 2048,
1747
+ "hadV": 2048,
1748
+ "in_features": 2048,
1749
+ "linear": {
1750
+ "KV": 7,
1751
+ "L": 16,
1752
+ "V": 2,
1753
+ "bias": false,
1754
+ "in_features": 2048,
1755
+ "linear_cls": "QTIPLinearTCQ",
1756
+ "linear_dtype": "float32",
1757
+ "out_features": 2048,
1758
+ "td_x": 16,
1759
+ "td_y": 16,
1760
+ "tlut_bits": 9
1761
+ },
1762
+ "module_type": "IncoherentLinear",
1763
+ "out_features": 2048,
1764
+ "rot_info": "skip_r",
1765
+ "scale": 32.0
1766
+ },
1767
+ "model.layers.3.self_attn.v_proj": {
1768
+ "bias": false,
1769
+ "dtype": "float32",
1770
+ "hadU": 2048,
1771
+ "hadV": 512,
1772
+ "in_features": 2048,
1773
+ "linear": {
1774
+ "KV": 10,
1775
+ "L": 16,
1776
+ "V": 2,
1777
+ "bias": false,
1778
+ "in_features": 2048,
1779
+ "linear_cls": "QTIPLinearTCQ",
1780
+ "linear_dtype": "float32",
1781
+ "out_features": 512,
1782
+ "td_x": 16,
1783
+ "td_y": 16,
1784
+ "tlut_bits": 11
1785
+ },
1786
+ "module_type": "IncoherentLinear",
1787
+ "out_features": 512,
1788
+ "rot_info": "skip_r",
1789
+ "scale": 32.0
1790
+ },
1791
+ "model.layers.4.mlp.down_proj": {
1792
+ "bias": false,
1793
+ "dtype": "float32",
1794
+ "hadU": 8192,
1795
+ "hadV": 2048,
1796
+ "in_features": 8192,
1797
+ "linear": {
1798
+ "KV": 7,
1799
+ "L": 16,
1800
+ "V": 2,
1801
+ "bias": false,
1802
+ "in_features": 8192,
1803
+ "linear_cls": "QTIPLinearTCQ",
1804
+ "linear_dtype": "float32",
1805
+ "out_features": 2048,
1806
+ "td_x": 16,
1807
+ "td_y": 16,
1808
+ "tlut_bits": 9
1809
+ },
1810
+ "module_type": "IncoherentLinear",
1811
+ "out_features": 2048,
1812
+ "rot_info": "skip_r",
1813
+ "scale": 32.0
1814
+ },
1815
+ "model.layers.4.mlp.gate_proj": {
1816
+ "bias": false,
1817
+ "dtype": "float32",
1818
+ "hadU": 2048,
1819
+ "hadV": 8192,
1820
+ "in_features": 2048,
1821
+ "linear": {
1822
+ "KV": 6,
1823
+ "L": 16,
1824
+ "V": 2,
1825
+ "bias": false,
1826
+ "in_features": 2048,
1827
+ "linear_cls": "QTIPLinearTCQ",
1828
+ "linear_dtype": "float32",
1829
+ "out_features": 8192,
1830
+ "td_x": 16,
1831
+ "td_y": 16,
1832
+ "tlut_bits": 9
1833
+ },
1834
+ "module_type": "IncoherentLinear",
1835
+ "out_features": 8192,
1836
+ "rot_info": "skip_r",
1837
+ "scale": 32.0
1838
+ },
1839
+ "model.layers.4.mlp.up_proj": {
1840
+ "bias": false,
1841
+ "dtype": "float32",
1842
+ "hadU": 2048,
1843
+ "hadV": 8192,
1844
+ "in_features": 2048,
1845
+ "linear": {
1846
+ "KV": 6,
1847
+ "L": 16,
1848
+ "V": 2,
1849
+ "bias": false,
1850
+ "in_features": 2048,
1851
+ "linear_cls": "QTIPLinearTCQ",
1852
+ "linear_dtype": "float32",
1853
+ "out_features": 8192,
1854
+ "td_x": 16,
1855
+ "td_y": 16,
1856
+ "tlut_bits": 9
1857
+ },
1858
+ "module_type": "IncoherentLinear",
1859
+ "out_features": 8192,
1860
+ "rot_info": "skip_r",
1861
+ "scale": 32.0
1862
+ },
1863
+ "model.layers.4.self_attn.k_proj": {
1864
+ "bias": false,
1865
+ "dtype": "float32",
1866
+ "hadU": 2048,
1867
+ "hadV": 512,
1868
+ "in_features": 2048,
1869
+ "linear": {
1870
+ "KV": [
1871
+ 8,
1872
+ 9
1873
+ ],
1874
+ "L": 16,
1875
+ "V": 2,
1876
+ "bias": false,
1877
+ "in_features": 2048,
1878
+ "in_part": [
1879
+ 1024,
1880
+ 1024
1881
+ ],
1882
+ "linear_cls": "CombtLinearTCQ",
1883
+ "linear_dtype": "float32",
1884
+ "out_features": 512,
1885
+ "td_x": 16,
1886
+ "td_y": 16,
1887
+ "tlut_bits": 10
1888
+ },
1889
+ "module_type": "IncoherentLinear",
1890
+ "out_features": 512,
1891
+ "rot_info": "skip_r",
1892
+ "scale": 32.0
1893
+ },
1894
+ "model.layers.4.self_attn.o_proj": {
1895
+ "bias": false,
1896
+ "dtype": "float32",
1897
+ "hadU": 2048,
1898
+ "hadV": 2048,
1899
+ "in_features": 2048,
1900
+ "linear": {
1901
+ "KV": 8,
1902
+ "L": 16,
1903
+ "V": 2,
1904
+ "bias": false,
1905
+ "in_features": 2048,
1906
+ "linear_cls": "QTIPLinearTCQ",
1907
+ "linear_dtype": "float32",
1908
+ "out_features": 2048,
1909
+ "td_x": 16,
1910
+ "td_y": 16,
1911
+ "tlut_bits": 9
1912
+ },
1913
+ "module_type": "IncoherentLinear",
1914
+ "out_features": 2048,
1915
+ "rot_info": "skip_r",
1916
+ "scale": 32.0
1917
+ },
1918
+ "model.layers.4.self_attn.q_proj": {
1919
+ "bias": false,
1920
+ "dtype": "float32",
1921
+ "hadU": 2048,
1922
+ "hadV": 2048,
1923
+ "in_features": 2048,
1924
+ "linear": {
1925
+ "KV": 7,
1926
+ "L": 16,
1927
+ "V": 2,
1928
+ "bias": false,
1929
+ "in_features": 2048,
1930
+ "linear_cls": "QTIPLinearTCQ",
1931
+ "linear_dtype": "float32",
1932
+ "out_features": 2048,
1933
+ "td_x": 16,
1934
+ "td_y": 16,
1935
+ "tlut_bits": 9
1936
+ },
1937
+ "module_type": "IncoherentLinear",
1938
+ "out_features": 2048,
1939
+ "rot_info": "skip_r",
1940
+ "scale": 32.0
1941
+ },
1942
+ "model.layers.4.self_attn.v_proj": {
1943
+ "bias": false,
1944
+ "dtype": "float32",
1945
+ "hadU": 2048,
1946
+ "hadV": 512,
1947
+ "in_features": 2048,
1948
+ "linear": {
1949
+ "KV": [
1950
+ 9,
1951
+ 10
1952
+ ],
1953
+ "L": 16,
1954
+ "V": 2,
1955
+ "bias": false,
1956
+ "in_features": 2048,
1957
+ "in_part": [
1958
+ 1024,
1959
+ 1024
1960
+ ],
1961
+ "linear_cls": "CombtLinearTCQ",
1962
+ "linear_dtype": "float32",
1963
+ "out_features": 512,
1964
+ "td_x": 16,
1965
+ "td_y": 16,
1966
+ "tlut_bits": 11
1967
+ },
1968
+ "module_type": "IncoherentLinear",
1969
+ "out_features": 512,
1970
+ "rot_info": "skip_r",
1971
+ "scale": 32.0
1972
+ },
1973
+ "model.layers.5.mlp.down_proj": {
1974
+ "bias": false,
1975
+ "dtype": "float32",
1976
+ "hadU": 8192,
1977
+ "hadV": 2048,
1978
+ "in_features": 8192,
1979
+ "linear": {
1980
+ "KV": 6,
1981
+ "L": 16,
1982
+ "V": 2,
1983
+ "bias": false,
1984
+ "in_features": 8192,
1985
+ "linear_cls": "QTIPLinearTCQ",
1986
+ "linear_dtype": "float32",
1987
+ "out_features": 2048,
1988
+ "td_x": 16,
1989
+ "td_y": 16,
1990
+ "tlut_bits": 9
1991
+ },
1992
+ "module_type": "IncoherentLinear",
1993
+ "out_features": 2048,
1994
+ "rot_info": "skip_r",
1995
+ "scale": 32.0
1996
+ },
1997
+ "model.layers.5.mlp.gate_proj": {
1998
+ "bias": false,
1999
+ "dtype": "float32",
2000
+ "hadU": 2048,
2001
+ "hadV": 8192,
2002
+ "in_features": 2048,
2003
+ "linear": {
2004
+ "KV": 6,
2005
+ "L": 16,
2006
+ "V": 2,
2007
+ "bias": false,
2008
+ "in_features": 2048,
2009
+ "linear_cls": "QTIPLinearTCQ",
2010
+ "linear_dtype": "float32",
2011
+ "out_features": 8192,
2012
+ "td_x": 16,
2013
+ "td_y": 16,
2014
+ "tlut_bits": 9
2015
+ },
2016
+ "module_type": "IncoherentLinear",
2017
+ "out_features": 8192,
2018
+ "rot_info": "skip_r",
2019
+ "scale": 32.0
2020
+ },
2021
+ "model.layers.5.mlp.up_proj": {
2022
+ "bias": false,
2023
+ "dtype": "float32",
2024
+ "hadU": 2048,
2025
+ "hadV": 8192,
2026
+ "in_features": 2048,
2027
+ "linear": {
2028
+ "KV": 6,
2029
+ "L": 16,
2030
+ "V": 2,
2031
+ "bias": false,
2032
+ "in_features": 2048,
2033
+ "linear_cls": "QTIPLinearTCQ",
2034
+ "linear_dtype": "float32",
2035
+ "out_features": 8192,
2036
+ "td_x": 16,
2037
+ "td_y": 16,
2038
+ "tlut_bits": 9
2039
+ },
2040
+ "module_type": "IncoherentLinear",
2041
+ "out_features": 8192,
2042
+ "rot_info": "skip_r",
2043
+ "scale": 32.0
2044
+ },
2045
+ "model.layers.5.self_attn.k_proj": {
2046
+ "bias": false,
2047
+ "dtype": "float32",
2048
+ "hadU": 2048,
2049
+ "hadV": 512,
2050
+ "in_features": 2048,
2051
+ "linear": {
2052
+ "KV": 9,
2053
+ "L": 16,
2054
+ "V": 2,
2055
+ "bias": false,
2056
+ "in_features": 2048,
2057
+ "linear_cls": "QTIPLinearTCQ",
2058
+ "linear_dtype": "float32",
2059
+ "out_features": 512,
2060
+ "td_x": 16,
2061
+ "td_y": 16,
2062
+ "tlut_bits": 10
2063
+ },
2064
+ "module_type": "IncoherentLinear",
2065
+ "out_features": 512,
2066
+ "rot_info": "skip_r",
2067
+ "scale": 32.0
2068
+ },
2069
+ "model.layers.5.self_attn.o_proj": {
2070
+ "bias": false,
2071
+ "dtype": "float32",
2072
+ "hadU": 2048,
2073
+ "hadV": 2048,
2074
+ "in_features": 2048,
2075
+ "linear": {
2076
+ "KV": [
2077
+ 8,
2078
+ 9
2079
+ ],
2080
+ "L": 16,
2081
+ "V": 2,
2082
+ "bias": false,
2083
+ "in_features": 2048,
2084
+ "in_part": [
2085
+ 1024,
2086
+ 1024
2087
+ ],
2088
+ "linear_cls": "CombtLinearTCQ",
2089
+ "linear_dtype": "float32",
2090
+ "out_features": 2048,
2091
+ "td_x": 16,
2092
+ "td_y": 16,
2093
+ "tlut_bits": 10
2094
+ },
2095
+ "module_type": "IncoherentLinear",
2096
+ "out_features": 2048,
2097
+ "rot_info": "skip_r",
2098
+ "scale": 32.0
2099
+ },
2100
+ "model.layers.5.self_attn.q_proj": {
2101
+ "bias": false,
2102
+ "dtype": "float32",
2103
+ "hadU": 2048,
2104
+ "hadV": 2048,
2105
+ "in_features": 2048,
2106
+ "linear": {
2107
+ "KV": 8,
2108
+ "L": 16,
2109
+ "V": 2,
2110
+ "bias": false,
2111
+ "in_features": 2048,
2112
+ "linear_cls": "QTIPLinearTCQ",
2113
+ "linear_dtype": "float32",
2114
+ "out_features": 2048,
2115
+ "td_x": 16,
2116
+ "td_y": 16,
2117
+ "tlut_bits": 9
2118
+ },
2119
+ "module_type": "IncoherentLinear",
2120
+ "out_features": 2048,
2121
+ "rot_info": "skip_r",
2122
+ "scale": 32.0
2123
+ },
2124
+ "model.layers.5.self_attn.v_proj": {
2125
+ "bias": false,
2126
+ "dtype": "float32",
2127
+ "hadU": 2048,
2128
+ "hadV": 512,
2129
+ "in_features": 2048,
2130
+ "linear": {
2131
+ "KV": [
2132
+ 9,
2133
+ 10
2134
+ ],
2135
+ "L": 16,
2136
+ "V": 2,
2137
+ "bias": false,
2138
+ "in_features": 2048,
2139
+ "in_part": [
2140
+ 1024,
2141
+ 1024
2142
+ ],
2143
+ "linear_cls": "CombtLinearTCQ",
2144
+ "linear_dtype": "float32",
2145
+ "out_features": 512,
2146
+ "td_x": 16,
2147
+ "td_y": 16,
2148
+ "tlut_bits": 11
2149
+ },
2150
+ "module_type": "IncoherentLinear",
2151
+ "out_features": 512,
2152
+ "rot_info": "skip_r",
2153
+ "scale": 32.0
2154
+ },
2155
+ "model.layers.6.mlp.down_proj": {
2156
+ "bias": false,
2157
+ "dtype": "float32",
2158
+ "hadU": 8192,
2159
+ "hadV": 2048,
2160
+ "in_features": 8192,
2161
+ "linear": {
2162
+ "KV": 6,
2163
+ "L": 16,
2164
+ "V": 2,
2165
+ "bias": false,
2166
+ "in_features": 8192,
2167
+ "linear_cls": "QTIPLinearTCQ",
2168
+ "linear_dtype": "float32",
2169
+ "out_features": 2048,
2170
+ "td_x": 16,
2171
+ "td_y": 16,
2172
+ "tlut_bits": 9
2173
+ },
2174
+ "module_type": "IncoherentLinear",
2175
+ "out_features": 2048,
2176
+ "rot_info": "skip_r",
2177
+ "scale": 32.0
2178
+ },
2179
+ "model.layers.6.mlp.gate_proj": {
2180
+ "bias": false,
2181
+ "dtype": "float32",
2182
+ "hadU": 2048,
2183
+ "hadV": 8192,
2184
+ "in_features": 2048,
2185
+ "linear": {
2186
+ "KV": 6,
2187
+ "L": 16,
2188
+ "V": 2,
2189
+ "bias": false,
2190
+ "in_features": 2048,
2191
+ "linear_cls": "QTIPLinearTCQ",
2192
+ "linear_dtype": "float32",
2193
+ "out_features": 8192,
2194
+ "td_x": 16,
2195
+ "td_y": 16,
2196
+ "tlut_bits": 9
2197
+ },
2198
+ "module_type": "IncoherentLinear",
2199
+ "out_features": 8192,
2200
+ "rot_info": "skip_r",
2201
+ "scale": 32.0
2202
+ },
2203
+ "model.layers.6.mlp.up_proj": {
2204
+ "bias": false,
2205
+ "dtype": "float32",
2206
+ "hadU": 2048,
2207
+ "hadV": 8192,
2208
+ "in_features": 2048,
2209
+ "linear": {
2210
+ "KV": 6,
2211
+ "L": 16,
2212
+ "V": 2,
2213
+ "bias": false,
2214
+ "in_features": 2048,
2215
+ "linear_cls": "QTIPLinearTCQ",
2216
+ "linear_dtype": "float32",
2217
+ "out_features": 8192,
2218
+ "td_x": 16,
2219
+ "td_y": 16,
2220
+ "tlut_bits": 9
2221
+ },
2222
+ "module_type": "IncoherentLinear",
2223
+ "out_features": 8192,
2224
+ "rot_info": "skip_r",
2225
+ "scale": 32.0
2226
+ },
2227
+ "model.layers.6.self_attn.k_proj": {
2228
+ "bias": false,
2229
+ "dtype": "float32",
2230
+ "hadU": 2048,
2231
+ "hadV": 512,
2232
+ "in_features": 2048,
2233
+ "linear": {
2234
+ "KV": [
2235
+ 8,
2236
+ 9
2237
+ ],
2238
+ "L": 16,
2239
+ "V": 2,
2240
+ "bias": false,
2241
+ "in_features": 2048,
2242
+ "in_part": [
2243
+ 1024,
2244
+ 1024
2245
+ ],
2246
+ "linear_cls": "CombtLinearTCQ",
2247
+ "linear_dtype": "float32",
2248
+ "out_features": 512,
2249
+ "td_x": 16,
2250
+ "td_y": 16,
2251
+ "tlut_bits": 10
2252
+ },
2253
+ "module_type": "IncoherentLinear",
2254
+ "out_features": 512,
2255
+ "rot_info": "skip_r",
2256
+ "scale": 32.0
2257
+ },
2258
+ "model.layers.6.self_attn.o_proj": {
2259
+ "bias": false,
2260
+ "dtype": "float32",
2261
+ "hadU": 2048,
2262
+ "hadV": 2048,
2263
+ "in_features": 2048,
2264
+ "linear": {
2265
+ "KV": 9,
2266
+ "L": 16,
2267
+ "V": 2,
2268
+ "bias": false,
2269
+ "in_features": 2048,
2270
+ "linear_cls": "QTIPLinearTCQ",
2271
+ "linear_dtype": "float32",
2272
+ "out_features": 2048,
2273
+ "td_x": 16,
2274
+ "td_y": 16,
2275
+ "tlut_bits": 10
2276
+ },
2277
+ "module_type": "IncoherentLinear",
2278
+ "out_features": 2048,
2279
+ "rot_info": "skip_r",
2280
+ "scale": 32.0
2281
+ },
2282
+ "model.layers.6.self_attn.q_proj": {
2283
+ "bias": false,
2284
+ "dtype": "float32",
2285
+ "hadU": 2048,
2286
+ "hadV": 2048,
2287
+ "in_features": 2048,
2288
+ "linear": {
2289
+ "KV": 7,
2290
+ "L": 16,
2291
+ "V": 2,
2292
+ "bias": false,
2293
+ "in_features": 2048,
2294
+ "linear_cls": "QTIPLinearTCQ",
2295
+ "linear_dtype": "float32",
2296
+ "out_features": 2048,
2297
+ "td_x": 16,
2298
+ "td_y": 16,
2299
+ "tlut_bits": 9
2300
+ },
2301
+ "module_type": "IncoherentLinear",
2302
+ "out_features": 2048,
2303
+ "rot_info": "skip_r",
2304
+ "scale": 32.0
2305
+ },
2306
+ "model.layers.6.self_attn.v_proj": {
2307
+ "bias": false,
2308
+ "dtype": "float32",
2309
+ "hadU": 2048,
2310
+ "hadV": 512,
2311
+ "in_features": 2048,
2312
+ "linear": {
2313
+ "KV": [
2314
+ 9,
2315
+ 10
2316
+ ],
2317
+ "L": 16,
2318
+ "V": 2,
2319
+ "bias": false,
2320
+ "in_features": 2048,
2321
+ "in_part": [
2322
+ 1024,
2323
+ 1024
2324
+ ],
2325
+ "linear_cls": "CombtLinearTCQ",
2326
+ "linear_dtype": "float32",
2327
+ "out_features": 512,
2328
+ "td_x": 16,
2329
+ "td_y": 16,
2330
+ "tlut_bits": 11
2331
+ },
2332
+ "module_type": "IncoherentLinear",
2333
+ "out_features": 512,
2334
+ "rot_info": "skip_r",
2335
+ "scale": 32.0
2336
+ },
2337
+ "model.layers.7.mlp.down_proj": {
2338
+ "bias": false,
2339
+ "dtype": "float32",
2340
+ "hadU": 8192,
2341
+ "hadV": 2048,
2342
+ "in_features": 8192,
2343
+ "linear": {
2344
+ "KV": [
2345
+ 6,
2346
+ 7
2347
+ ],
2348
+ "L": 16,
2349
+ "V": 2,
2350
+ "bias": false,
2351
+ "in_features": 8192,
2352
+ "in_part": [
2353
+ 4096,
2354
+ 4096
2355
+ ],
2356
+ "linear_cls": "CombtLinearTCQ",
2357
+ "linear_dtype": "float32",
2358
+ "out_features": 2048,
2359
+ "td_x": 16,
2360
+ "td_y": 16,
2361
+ "tlut_bits": 9
2362
+ },
2363
+ "module_type": "IncoherentLinear",
2364
+ "out_features": 2048,
2365
+ "rot_info": "skip_r",
2366
+ "scale": 32.0
2367
+ },
2368
+ "model.layers.7.mlp.gate_proj": {
2369
+ "bias": false,
2370
+ "dtype": "float32",
2371
+ "hadU": 2048,
2372
+ "hadV": 8192,
2373
+ "in_features": 2048,
2374
+ "linear": {
2375
+ "KV": 6,
2376
+ "L": 16,
2377
+ "V": 2,
2378
+ "bias": false,
2379
+ "in_features": 2048,
2380
+ "linear_cls": "QTIPLinearTCQ",
2381
+ "linear_dtype": "float32",
2382
+ "out_features": 8192,
2383
+ "td_x": 16,
2384
+ "td_y": 16,
2385
+ "tlut_bits": 9
2386
+ },
2387
+ "module_type": "IncoherentLinear",
2388
+ "out_features": 8192,
2389
+ "rot_info": "skip_r",
2390
+ "scale": 32.0
2391
+ },
2392
+ "model.layers.7.mlp.up_proj": {
2393
+ "bias": false,
2394
+ "dtype": "float32",
2395
+ "hadU": 2048,
2396
+ "hadV": 8192,
2397
+ "in_features": 2048,
2398
+ "linear": {
2399
+ "KV": 6,
2400
+ "L": 16,
2401
+ "V": 2,
2402
+ "bias": false,
2403
+ "in_features": 2048,
2404
+ "linear_cls": "QTIPLinearTCQ",
2405
+ "linear_dtype": "float32",
2406
+ "out_features": 8192,
2407
+ "td_x": 16,
2408
+ "td_y": 16,
2409
+ "tlut_bits": 9
2410
+ },
2411
+ "module_type": "IncoherentLinear",
2412
+ "out_features": 8192,
2413
+ "rot_info": "skip_r",
2414
+ "scale": 32.0
2415
+ },
2416
+ "model.layers.7.self_attn.k_proj": {
2417
+ "bias": false,
2418
+ "dtype": "float32",
2419
+ "hadU": 2048,
2420
+ "hadV": 512,
2421
+ "in_features": 2048,
2422
+ "linear": {
2423
+ "KV": [
2424
+ 8,
2425
+ 9
2426
+ ],
2427
+ "L": 16,
2428
+ "V": 2,
2429
+ "bias": false,
2430
+ "in_features": 2048,
2431
+ "in_part": [
2432
+ 1024,
2433
+ 1024
2434
+ ],
2435
+ "linear_cls": "CombtLinearTCQ",
2436
+ "linear_dtype": "float32",
2437
+ "out_features": 512,
2438
+ "td_x": 16,
2439
+ "td_y": 16,
2440
+ "tlut_bits": 10
2441
+ },
2442
+ "module_type": "IncoherentLinear",
2443
+ "out_features": 512,
2444
+ "rot_info": "skip_r",
2445
+ "scale": 32.0
2446
+ },
2447
+ "model.layers.7.self_attn.o_proj": {
2448
+ "bias": false,
2449
+ "dtype": "float32",
2450
+ "hadU": 2048,
2451
+ "hadV": 2048,
2452
+ "in_features": 2048,
2453
+ "linear": {
2454
+ "KV": 9,
2455
+ "L": 16,
2456
+ "V": 2,
2457
+ "bias": false,
2458
+ "in_features": 2048,
2459
+ "linear_cls": "QTIPLinearTCQ",
2460
+ "linear_dtype": "float32",
2461
+ "out_features": 2048,
2462
+ "td_x": 16,
2463
+ "td_y": 16,
2464
+ "tlut_bits": 10
2465
+ },
2466
+ "module_type": "IncoherentLinear",
2467
+ "out_features": 2048,
2468
+ "rot_info": "skip_r",
2469
+ "scale": 32.0
2470
+ },
2471
+ "model.layers.7.self_attn.q_proj": {
2472
+ "bias": false,
2473
+ "dtype": "float32",
2474
+ "hadU": 2048,
2475
+ "hadV": 2048,
2476
+ "in_features": 2048,
2477
+ "linear": {
2478
+ "KV": 7,
2479
+ "L": 16,
2480
+ "V": 2,
2481
+ "bias": false,
2482
+ "in_features": 2048,
2483
+ "linear_cls": "QTIPLinearTCQ",
2484
+ "linear_dtype": "float32",
2485
+ "out_features": 2048,
2486
+ "td_x": 16,
2487
+ "td_y": 16,
2488
+ "tlut_bits": 9
2489
+ },
2490
+ "module_type": "IncoherentLinear",
2491
+ "out_features": 2048,
2492
+ "rot_info": "skip_r",
2493
+ "scale": 32.0
2494
+ },
2495
+ "model.layers.7.self_attn.v_proj": {
2496
+ "bias": false,
2497
+ "dtype": "float32",
2498
+ "hadU": 2048,
2499
+ "hadV": 512,
2500
+ "in_features": 2048,
2501
+ "linear": {
2502
+ "KV": 10,
2503
+ "L": 16,
2504
+ "V": 2,
2505
+ "bias": false,
2506
+ "in_features": 2048,
2507
+ "linear_cls": "QTIPLinearTCQ",
2508
+ "linear_dtype": "float32",
2509
+ "out_features": 512,
2510
+ "td_x": 16,
2511
+ "td_y": 16,
2512
+ "tlut_bits": 11
2513
+ },
2514
+ "module_type": "IncoherentLinear",
2515
+ "out_features": 512,
2516
+ "rot_info": "skip_r",
2517
+ "scale": 32.0
2518
+ },
2519
+ "model.layers.8.mlp.down_proj": {
2520
+ "bias": false,
2521
+ "dtype": "float32",
2522
+ "hadU": 8192,
2523
+ "hadV": 2048,
2524
+ "in_features": 8192,
2525
+ "linear": {
2526
+ "KV": 7,
2527
+ "L": 16,
2528
+ "V": 2,
2529
+ "bias": false,
2530
+ "in_features": 8192,
2531
+ "linear_cls": "QTIPLinearTCQ",
2532
+ "linear_dtype": "float32",
2533
+ "out_features": 2048,
2534
+ "td_x": 16,
2535
+ "td_y": 16,
2536
+ "tlut_bits": 9
2537
+ },
2538
+ "module_type": "IncoherentLinear",
2539
+ "out_features": 2048,
2540
+ "rot_info": "skip_r",
2541
+ "scale": 32.0
2542
+ },
2543
+ "model.layers.8.mlp.gate_proj": {
2544
+ "bias": false,
2545
+ "dtype": "float32",
2546
+ "hadU": 2048,
2547
+ "hadV": 8192,
2548
+ "in_features": 2048,
2549
+ "linear": {
2550
+ "KV": 6,
2551
+ "L": 16,
2552
+ "V": 2,
2553
+ "bias": false,
2554
+ "in_features": 2048,
2555
+ "linear_cls": "QTIPLinearTCQ",
2556
+ "linear_dtype": "float32",
2557
+ "out_features": 8192,
2558
+ "td_x": 16,
2559
+ "td_y": 16,
2560
+ "tlut_bits": 9
2561
+ },
2562
+ "module_type": "IncoherentLinear",
2563
+ "out_features": 8192,
2564
+ "rot_info": "skip_r",
2565
+ "scale": 32.0
2566
+ },
2567
+ "model.layers.8.mlp.up_proj": {
2568
+ "bias": false,
2569
+ "dtype": "float32",
2570
+ "hadU": 2048,
2571
+ "hadV": 8192,
2572
+ "in_features": 2048,
2573
+ "linear": {
2574
+ "KV": 7,
2575
+ "L": 16,
2576
+ "V": 2,
2577
+ "bias": false,
2578
+ "in_features": 2048,
2579
+ "linear_cls": "QTIPLinearTCQ",
2580
+ "linear_dtype": "float32",
2581
+ "out_features": 8192,
2582
+ "td_x": 16,
2583
+ "td_y": 16,
2584
+ "tlut_bits": 9
2585
+ },
2586
+ "module_type": "IncoherentLinear",
2587
+ "out_features": 8192,
2588
+ "rot_info": "skip_r",
2589
+ "scale": 32.0
2590
+ },
2591
+ "model.layers.8.self_attn.k_proj": {
2592
+ "bias": false,
2593
+ "dtype": "float32",
2594
+ "hadU": 2048,
2595
+ "hadV": 512,
2596
+ "in_features": 2048,
2597
+ "linear": {
2598
+ "KV": [
2599
+ 8,
2600
+ 9
2601
+ ],
2602
+ "L": 16,
2603
+ "V": 2,
2604
+ "bias": false,
2605
+ "in_features": 2048,
2606
+ "in_part": [
2607
+ 1024,
2608
+ 1024
2609
+ ],
2610
+ "linear_cls": "CombtLinearTCQ",
2611
+ "linear_dtype": "float32",
2612
+ "out_features": 512,
2613
+ "td_x": 16,
2614
+ "td_y": 16,
2615
+ "tlut_bits": 10
2616
+ },
2617
+ "module_type": "IncoherentLinear",
2618
+ "out_features": 512,
2619
+ "rot_info": "skip_r",
2620
+ "scale": 32.0
2621
+ },
2622
+ "model.layers.8.self_attn.o_proj": {
2623
+ "bias": false,
2624
+ "dtype": "float32",
2625
+ "hadU": 2048,
2626
+ "hadV": 2048,
2627
+ "in_features": 2048,
2628
+ "linear": {
2629
+ "KV": 9,
2630
+ "L": 16,
2631
+ "V": 2,
2632
+ "bias": false,
2633
+ "in_features": 2048,
2634
+ "linear_cls": "QTIPLinearTCQ",
2635
+ "linear_dtype": "float32",
2636
+ "out_features": 2048,
2637
+ "td_x": 16,
2638
+ "td_y": 16,
2639
+ "tlut_bits": 10
2640
+ },
2641
+ "module_type": "IncoherentLinear",
2642
+ "out_features": 2048,
2643
+ "rot_info": "skip_r",
2644
+ "scale": 32.0
2645
+ },
2646
+ "model.layers.8.self_attn.q_proj": {
2647
+ "bias": false,
2648
+ "dtype": "float32",
2649
+ "hadU": 2048,
2650
+ "hadV": 2048,
2651
+ "in_features": 2048,
2652
+ "linear": {
2653
+ "KV": 7,
2654
+ "L": 16,
2655
+ "V": 2,
2656
+ "bias": false,
2657
+ "in_features": 2048,
2658
+ "linear_cls": "QTIPLinearTCQ",
2659
+ "linear_dtype": "float32",
2660
+ "out_features": 2048,
2661
+ "td_x": 16,
2662
+ "td_y": 16,
2663
+ "tlut_bits": 9
2664
+ },
2665
+ "module_type": "IncoherentLinear",
2666
+ "out_features": 2048,
2667
+ "rot_info": "skip_r",
2668
+ "scale": 32.0
2669
+ },
2670
+ "model.layers.8.self_attn.v_proj": {
2671
+ "bias": false,
2672
+ "dtype": "float32",
2673
+ "hadU": 2048,
2674
+ "hadV": 512,
2675
+ "in_features": 2048,
2676
+ "linear": {
2677
+ "KV": 10,
2678
+ "L": 16,
2679
+ "V": 2,
2680
+ "bias": false,
2681
+ "in_features": 2048,
2682
+ "linear_cls": "QTIPLinearTCQ",
2683
+ "linear_dtype": "float32",
2684
+ "out_features": 512,
2685
+ "td_x": 16,
2686
+ "td_y": 16,
2687
+ "tlut_bits": 11
2688
+ },
2689
+ "module_type": "IncoherentLinear",
2690
+ "out_features": 512,
2691
+ "rot_info": "skip_r",
2692
+ "scale": 32.0
2693
+ },
2694
+ "model.layers.9.mlp.down_proj": {
2695
+ "bias": false,
2696
+ "dtype": "float32",
2697
+ "hadU": 8192,
2698
+ "hadV": 2048,
2699
+ "in_features": 8192,
2700
+ "linear": {
2701
+ "KV": 7,
2702
+ "L": 16,
2703
+ "V": 2,
2704
+ "bias": false,
2705
+ "in_features": 8192,
2706
+ "linear_cls": "QTIPLinearTCQ",
2707
+ "linear_dtype": "float32",
2708
+ "out_features": 2048,
2709
+ "td_x": 16,
2710
+ "td_y": 16,
2711
+ "tlut_bits": 9
2712
+ },
2713
+ "module_type": "IncoherentLinear",
2714
+ "out_features": 2048,
2715
+ "rot_info": "skip_r",
2716
+ "scale": 32.0
2717
+ },
2718
+ "model.layers.9.mlp.gate_proj": {
2719
+ "bias": false,
2720
+ "dtype": "float32",
2721
+ "hadU": 2048,
2722
+ "hadV": 8192,
2723
+ "in_features": 2048,
2724
+ "linear": {
2725
+ "KV": 6,
2726
+ "L": 16,
2727
+ "V": 2,
2728
+ "bias": false,
2729
+ "in_features": 2048,
2730
+ "linear_cls": "QTIPLinearTCQ",
2731
+ "linear_dtype": "float32",
2732
+ "out_features": 8192,
2733
+ "td_x": 16,
2734
+ "td_y": 16,
2735
+ "tlut_bits": 9
2736
+ },
2737
+ "module_type": "IncoherentLinear",
2738
+ "out_features": 8192,
2739
+ "rot_info": "skip_r",
2740
+ "scale": 32.0
2741
+ },
2742
+ "model.layers.9.mlp.up_proj": {
2743
+ "bias": false,
2744
+ "dtype": "float32",
2745
+ "hadU": 2048,
2746
+ "hadV": 8192,
2747
+ "in_features": 2048,
2748
+ "linear": {
2749
+ "KV": 7,
2750
+ "L": 16,
2751
+ "V": 2,
2752
+ "bias": false,
2753
+ "in_features": 2048,
2754
+ "linear_cls": "QTIPLinearTCQ",
2755
+ "linear_dtype": "float32",
2756
+ "out_features": 8192,
2757
+ "td_x": 16,
2758
+ "td_y": 16,
2759
+ "tlut_bits": 9
2760
+ },
2761
+ "module_type": "IncoherentLinear",
2762
+ "out_features": 8192,
2763
+ "rot_info": "skip_r",
2764
+ "scale": 32.0
2765
+ },
2766
+ "model.layers.9.self_attn.k_proj": {
2767
+ "bias": false,
2768
+ "dtype": "float32",
2769
+ "hadU": 2048,
2770
+ "hadV": 512,
2771
+ "in_features": 2048,
2772
+ "linear": {
2773
+ "KV": 9,
2774
+ "L": 16,
2775
+ "V": 2,
2776
+ "bias": false,
2777
+ "in_features": 2048,
2778
+ "linear_cls": "QTIPLinearTCQ",
2779
+ "linear_dtype": "float32",
2780
+ "out_features": 512,
2781
+ "td_x": 16,
2782
+ "td_y": 16,
2783
+ "tlut_bits": 10
2784
+ },
2785
+ "module_type": "IncoherentLinear",
2786
+ "out_features": 512,
2787
+ "rot_info": "skip_r",
2788
+ "scale": 32.0
2789
+ },
2790
+ "model.layers.9.self_attn.o_proj": {
2791
+ "bias": false,
2792
+ "dtype": "float32",
2793
+ "hadU": 2048,
2794
+ "hadV": 2048,
2795
+ "in_features": 2048,
2796
+ "linear": {
2797
+ "KV": [
2798
+ 8,
2799
+ 9
2800
+ ],
2801
+ "L": 16,
2802
+ "V": 2,
2803
+ "bias": false,
2804
+ "in_features": 2048,
2805
+ "in_part": [
2806
+ 1024,
2807
+ 1024
2808
+ ],
2809
+ "linear_cls": "CombtLinearTCQ",
2810
+ "linear_dtype": "float32",
2811
+ "out_features": 2048,
2812
+ "td_x": 16,
2813
+ "td_y": 16,
2814
+ "tlut_bits": 10
2815
+ },
2816
+ "module_type": "IncoherentLinear",
2817
+ "out_features": 2048,
2818
+ "rot_info": "skip_r",
2819
+ "scale": 32.0
2820
+ },
2821
+ "model.layers.9.self_attn.q_proj": {
2822
+ "bias": false,
2823
+ "dtype": "float32",
2824
+ "hadU": 2048,
2825
+ "hadV": 2048,
2826
+ "in_features": 2048,
2827
+ "linear": {
2828
+ "KV": 7,
2829
+ "L": 16,
2830
+ "V": 2,
2831
+ "bias": false,
2832
+ "in_features": 2048,
2833
+ "linear_cls": "QTIPLinearTCQ",
2834
+ "linear_dtype": "float32",
2835
+ "out_features": 2048,
2836
+ "td_x": 16,
2837
+ "td_y": 16,
2838
+ "tlut_bits": 9
2839
+ },
2840
+ "module_type": "IncoherentLinear",
2841
+ "out_features": 2048,
2842
+ "rot_info": "skip_r",
2843
+ "scale": 32.0
2844
+ },
2845
+ "model.layers.9.self_attn.v_proj": {
2846
+ "bias": false,
2847
+ "dtype": "float32",
2848
+ "hadU": 2048,
2849
+ "hadV": 512,
2850
+ "in_features": 2048,
2851
+ "linear": {
2852
+ "KV": 10,
2853
+ "L": 16,
2854
+ "V": 2,
2855
+ "bias": false,
2856
+ "in_features": 2048,
2857
+ "linear_cls": "QTIPLinearTCQ",
2858
+ "linear_dtype": "float32",
2859
+ "out_features": 512,
2860
+ "td_x": 16,
2861
+ "td_y": 16,
2862
+ "tlut_bits": 11
2863
+ },
2864
+ "module_type": "IncoherentLinear",
2865
+ "out_features": 512,
2866
+ "rot_info": "skip_r",
2867
+ "scale": 32.0
2868
+ }
2869
+ }
2870
+ },
2871
+ "rms_norm_eps": 1e-05,
2872
+ "rope_scaling": {
2873
+ "factor": 32.0,
2874
+ "high_freq_factor": 4.0,
2875
+ "low_freq_factor": 1.0,
2876
+ "original_max_position_embeddings": 8192,
2877
+ "rope_type": "llama3"
2878
+ },
2879
+ "rope_theta": 500000.0,
2880
+ "tie_word_embeddings": true,
2881
+ "torch_dtype": "float16",
2882
+ "transformers_version": "4.45.2",
2883
+ "use_cache": true,
2884
+ "vocab_size": 128256
2885
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 128000,
4
+ "do_sample": true,
5
+ "eos_token_id": 128001,
6
+ "temperature": 0.6,
7
+ "top_p": 0.9,
8
+ "transformers_version": "4.45.2"
9
+ }
lib/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (158 Bytes). View file
 
lib/__pycache__/config.cpython-311.pyc ADDED
Binary file (318 Bytes). View file
 
lib/algo/__init__.py ADDED
File without changes
lib/algo/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (163 Bytes). View file
 
lib/algo/__pycache__/ldlq.cpython-311.pyc ADDED
Binary file (13.3 kB). View file
 
lib/algo/ldlq.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import os
3
+
4
+ import glog
5
+ import torch
6
+ from tqdm import tqdm
7
+ import time
8
+ from lib import utils
9
+
10
+ _PERMUTE = torch.arange(256).reshape(2, 8, 2, 4, 2).permute(1, 3, 2, 0,
11
+ 4).flatten()
12
+ _INV_PERMUTE = torch.zeros(256, dtype=torch.int64)
13
+ _INV_PERMUTE[_PERMUTE] = torch.arange(256)
14
+
15
+
16
+ def LDLQ_VQ(Wr, L, cb, buf_cols=128):
17
+ buf_cols = max(buf_cols, cb.vec_sz)
18
+ (m, n) = Wr.shape
19
+ assert buf_cols % cb.vec_sz == 0
20
+ assert n % buf_cols == 0
21
+ buf_size = buf_cols // cb.vec_sz
22
+
23
+ hatWr_T = torch.zeros(n, m, dtype=L.dtype, device=L.device)
24
+ Qidxs_T = torch.zeros(n // cb.vec_sz, m, dtype=cb.idx_dtype, device=L.device)
25
+
26
+ device = Wr.device
27
+ Wr = Wr.cpu()
28
+ utils.clean()
29
+ Wr_T = Wr.T.contiguous().to(device)
30
+
31
+ prod_cache = torch.zeros(n, m, dtype=Wr_T.dtype, device=Wr_T.device)
32
+ for cur_col in tqdm(range(n // cb.vec_sz, 0, -buf_size)):
33
+ b_Wr_T = Wr_T[cb.vec_sz * (cur_col - buf_size):cb.vec_sz * cur_col]
34
+ b_hatWr_T = hatWr_T[cb.vec_sz * (cur_col - buf_size):cb.vec_sz *
35
+ cur_col]
36
+ b_L = L[cb.vec_sz * (cur_col - buf_size):cb.vec_sz *
37
+ cur_col].contiguous()
38
+ b_prod = prod_cache[cb.vec_sz * (cur_col - buf_size):cb.vec_sz *
39
+ cur_col]
40
+ b_Qidxs_T = Qidxs_T[(cur_col - buf_size):cur_col]
41
+ L_offset = cb.vec_sz * (cur_col - buf_size)
42
+ for i in reversed(range(buf_size)):
43
+ WXWX = b_Wr_T[cb.vec_sz * i : cb.vec_sz * (i + 1)] + \
44
+ b_L[cb.vec_sz * (i + 1):, L_offset + cb.vec_sz * i : L_offset + cb.vec_sz * (i + 1)].T @ \
45
+ (b_Wr_T[cb.vec_sz * (i + 1):] - b_hatWr_T[cb.vec_sz * (i + 1):]) + \
46
+ b_prod[cb.vec_sz * i : cb.vec_sz * (i + 1)]
47
+
48
+ q_out = cb.quantize(WXWX.T)
49
+ b_hatWr_T[cb.vec_sz * i:cb.vec_sz * (i + 1)] = q_out[0].T
50
+ b_Qidxs_T[i:(i + 1)] = q_out[1].T
51
+
52
+ prod_cache += b_L.T @ (b_Wr_T - b_hatWr_T)
53
+ hatWr_T[cb.vec_sz * (cur_col - buf_size):cb.vec_sz *
54
+ cur_col] = b_hatWr_T
55
+
56
+ del b_Wr_T, b_hatWr_T, b_L, b_prod, L_offset, prod_cache
57
+ utils.clean()
58
+ return hatWr_T.T.contiguous(), Qidxs_T.T.contiguous()
59
+
60
+
61
+ def LDLQ(Wr, L, cb, args, buf_cols=128, for_kernel=True):
62
+ if for_kernel:
63
+ assert args.td_x == 16 and args.td_y == 16
64
+ buf_cols = max(buf_cols, args.td_y)
65
+ trellissz = args.td_x * args.td_y
66
+ (m, n) = Wr.shape
67
+ assert buf_cols % args.td_y == 0
68
+ assert n % buf_cols == 0
69
+ assert args.td_y % args.V == 0
70
+ buf_size = buf_cols // args.td_y
71
+
72
+ hatWr_T = torch.zeros(n, m, dtype=L.dtype, device=L.device)
73
+ Qidxs_T = torch.zeros(n // args.V, m, dtype=cb.idx_dtype, device=L.device)
74
+
75
+ device = Wr.device
76
+ Wr = Wr.cpu()
77
+ utils.clean()
78
+ Wr_T = Wr.T.contiguous().to(device)
79
+
80
+ # quip
81
+ prod_cache = torch.zeros(n, m, dtype=Wr_T.dtype, device=Wr_T.device)
82
+ for cur_col in tqdm(range(n // args.td_y, 0, -buf_size)):
83
+ b_Wr_T = Wr_T[args.td_y * (cur_col - buf_size):args.td_y * cur_col]
84
+ b_hatWr_T = hatWr_T[args.td_y * (cur_col - buf_size):args.td_y *
85
+ cur_col]
86
+ b_L = L[args.td_y * (cur_col - buf_size):args.td_y *
87
+ cur_col].contiguous()
88
+ b_prod = prod_cache[args.td_y * (cur_col - buf_size):args.td_y *
89
+ cur_col]
90
+ b_Qidxs_T = Qidxs_T[args.td_y * (cur_col - buf_size) //
91
+ args.V:args.td_y * cur_col // args.V]
92
+ L_offset = args.td_y * (cur_col - buf_size)
93
+ for i in reversed(range(buf_size)):
94
+ WXWX = b_Wr_T[args.td_y * i : args.td_y * (i + 1)] + \
95
+ b_L[args.td_y * (i + 1):, L_offset + args.td_y * i : L_offset + args.td_y * (i + 1)].T @ \
96
+ (b_Wr_T[args.td_y * (i + 1):] - b_hatWr_T[args.td_y * (i + 1):]) + \
97
+ b_prod[args.td_y * i : args.td_y * (i + 1)]
98
+ if trellissz > -1:
99
+ WXWXshape = WXWX.shape
100
+ thing = WXWX.T.reshape(-1, trellissz)
101
+ if for_kernel:
102
+ thing = thing[..., _PERMUTE]
103
+ q_out = cb.quantize(thing)
104
+ if for_kernel:
105
+ thing = q_out[0][..., _INV_PERMUTE].reshape(
106
+ WXWXshape[1], WXWXshape[0])
107
+ else:
108
+ thing = q_out[0].reshape(WXWXshape[1], WXWXshape[0])
109
+ idxs = q_out[1].reshape(WXWXshape[1], WXWXshape[0] // args.V)
110
+ b_hatWr_T[args.td_y * i:args.td_y * (i + 1)] = thing.T
111
+ b_Qidxs_T[args.td_y // args.V * i:args.td_y // args.V *
112
+ (i + 1)] = idxs.T
113
+ else:
114
+ q_out = cb.quantize(WXWX.T)
115
+ b_hatWr_T[args.td_y * i:args.td_y * (i + 1)] = q_out[0].T
116
+ b_Qidxs_T[args.td_y // args.V * i:args.td_y // args.V *
117
+ (i + 1)] = q_out[1].T
118
+
119
+ prod_cache += b_L.T @ (b_Wr_T - b_hatWr_T)
120
+ hatWr_T[args.td_y * (cur_col - buf_size):args.td_y *
121
+ cur_col] = b_hatWr_T
122
+
123
+ del b_Wr_T, b_hatWr_T, b_L, b_prod, L_offset, prod_cache
124
+ utils.clean()
125
+ return hatWr_T.T.contiguous(), Qidxs_T.T.contiguous()
126
+
127
+
128
+ def LDLQ_combt(Wr, L, cb1, cb2, args, buf_cols=128, for_kernel=True):
129
+ if for_kernel:
130
+ assert args.td_x == 16 and args.td_y == 16
131
+ buf_cols = max(buf_cols, args.td_y)
132
+ trellissz = args.td_x * args.td_y
133
+ (m, n) = Wr.shape
134
+ assert buf_cols % args.td_y == 0
135
+ assert n % buf_cols == 0
136
+ assert args.td_y % args.V == 0
137
+ buf_size = buf_cols // args.td_y
138
+
139
+ hatWr_T = torch.zeros(n, m, dtype=L.dtype, device=L.device)
140
+ Qidxs_T = torch.zeros(n // args.V, m, dtype=cb1.idx_dtype, device=L.device)
141
+
142
+ device = Wr.device
143
+ Wr = Wr.cpu()
144
+ utils.clean()
145
+ Wr_T = Wr.T.contiguous().to(device)
146
+
147
+ # quip
148
+ prod_cache = torch.zeros(n, m, dtype=Wr_T.dtype, device=Wr_T.device)
149
+
150
+ flag_for_cb1_compile = True
151
+ for cur_col in tqdm(range(n // args.td_y, 0, -buf_size)):
152
+ b_Wr_T = Wr_T[args.td_y * (cur_col - buf_size):args.td_y * cur_col]
153
+ b_hatWr_T = hatWr_T[args.td_y * (cur_col - buf_size):args.td_y *
154
+ cur_col]
155
+ b_L = L[args.td_y * (cur_col - buf_size):args.td_y *
156
+ cur_col].contiguous()
157
+ b_prod = prod_cache[args.td_y * (cur_col - buf_size):args.td_y *
158
+ cur_col]
159
+ b_Qidxs_T = Qidxs_T[args.td_y * (cur_col - buf_size) //
160
+ args.V:args.td_y * cur_col // args.V]
161
+ L_offset = args.td_y * (cur_col - buf_size)
162
+ for i in reversed(range(buf_size)):
163
+ WXWX = b_Wr_T[args.td_y * i : args.td_y * (i + 1)] + \
164
+ b_L[args.td_y * (i + 1):, L_offset + args.td_y * i : L_offset + args.td_y * (i + 1)].T @ \
165
+ (b_Wr_T[args.td_y * (i + 1):] - b_hatWr_T[args.td_y * (i + 1):]) + \
166
+ b_prod[args.td_y * i : args.td_y * (i + 1)]
167
+ if trellissz > -1:
168
+ WXWXshape = WXWX.shape
169
+ thing = WXWX.T.reshape(-1, trellissz)
170
+ if for_kernel:
171
+ thing = thing[..., _PERMUTE]
172
+ if args.td_y * (cur_col - buf_size) >= n // 2:
173
+ q_out = cb2.quantize(thing)
174
+ else:
175
+ if flag_for_cb1_compile:
176
+ torch._dynamo.reset()
177
+ flag_for_cb1_compile = False
178
+ q_out = cb1.quantize(thing)
179
+ if for_kernel:
180
+ thing = q_out[0][..., _INV_PERMUTE].reshape(
181
+ WXWXshape[1], WXWXshape[0])
182
+ else:
183
+ thing = q_out[0].reshape(WXWXshape[1], WXWXshape[0])
184
+ idxs = q_out[1].reshape(WXWXshape[1], WXWXshape[0] // args.V)
185
+ b_hatWr_T[args.td_y * i:args.td_y * (i + 1)] = thing.T
186
+ b_Qidxs_T[args.td_y // args.V * i:args.td_y // args.V *
187
+ (i + 1)] = idxs.T
188
+ else:
189
+ if args.td_y * (cur_col - buf_size) >= n // 2:
190
+ q_out = cb2.quantize(WXWX.T)
191
+ else:
192
+ q_out = cb1.quantize(WXWX.T)
193
+ b_hatWr_T[args.td_y * i:args.td_y * (i + 1)] = q_out[0].T
194
+ b_Qidxs_T[args.td_y // args.V * i:args.td_y // args.V *
195
+ (i + 1)] = q_out[1].T
196
+
197
+ prod_cache += b_L.T @ (b_Wr_T - b_hatWr_T)
198
+ hatWr_T[args.td_y * (cur_col - buf_size):args.td_y *
199
+ cur_col] = b_hatWr_T
200
+
201
+ del b_Wr_T, b_hatWr_T, b_L, b_prod, L_offset, prod_cache
202
+ utils.clean()
203
+ return hatWr_T.T.contiguous(), Qidxs_T.T.contiguous()
lib/algo/ldlq_beam_cd.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import os
3
+
4
+ import glog
5
+ import torch
6
+ from tqdm import tqdm
7
+ import time
8
+ from lib import utils
9
+
10
+ _PERMUTE = torch.arange(256).reshape(2, 8, 2, 4, 2).permute(1, 3, 2, 0,
11
+ 4).flatten()
12
+
13
+ _PERMUTE_HALF = torch.arange(128).reshape(2, 8, 2, 4, 1).permute(1, 3, 2, 0,
14
+ 4).flatten()
15
+ _INV_PERMUTE = torch.zeros(256, dtype=torch.int64)
16
+ _INV_PERMUTE[_PERMUTE] = torch.arange(256)
17
+ _INV_PERMUTE_HALF = torch.zeros(128, dtype=torch.int64)
18
+ _INV_PERMUTE_HALF[_PERMUTE_HALF] = torch.arange(128)
19
+
20
+ def LDLQ(Wr, L, cb, args, D=None, buf_cols=128, for_kernel=True, use_beam_search=False, use_diag=False):
21
+ if for_kernel:
22
+ assert args.td_x == 16 and args.td_y == 16
23
+ buf_cols = max(buf_cols, args.td_y)
24
+ trellissz = args.td_x * args.td_y
25
+ (m, n) = Wr.shape
26
+ assert buf_cols % args.td_y == 0
27
+ assert n % buf_cols == 0
28
+ assert args.td_y % args.V == 0
29
+ buf_size = buf_cols // args.td_y
30
+
31
+ hatWr_T = torch.zeros(n, m, dtype=L.dtype, device=L.device)
32
+ Qidxs_T = torch.zeros(n // args.V, m, dtype=cb.idx_dtype, device=L.device)
33
+
34
+ device = Wr.device
35
+ Wr = Wr.cpu()
36
+ utils.clean()
37
+ Wr_T = Wr.T.contiguous().to(device)
38
+
39
+ # quip
40
+ prod_cache = torch.zeros(n, m, dtype=Wr_T.dtype, device=Wr_T.device)
41
+ for cur_col in tqdm(range(n // args.td_y, 0, -buf_size)):
42
+ b_Wr_T = Wr_T[args.td_y * (cur_col - buf_size):args.td_y * cur_col]
43
+ b_hatWr_T = hatWr_T[args.td_y * (cur_col - buf_size):args.td_y *
44
+ cur_col]
45
+ b_L = L[args.td_y * (cur_col - buf_size):args.td_y *
46
+ cur_col].contiguous()
47
+ b_prod = prod_cache[args.td_y * (cur_col - buf_size):args.td_y *
48
+ cur_col]
49
+ b_Qidxs_T = Qidxs_T[args.td_y * (cur_col - buf_size) //
50
+ args.V:args.td_y * cur_col // args.V]
51
+ L_offset = args.td_y * (cur_col - buf_size)
52
+ for i in reversed(range(buf_size)):
53
+ WXWX = b_Wr_T[args.td_y * i : args.td_y * (i + 1)] + \
54
+ b_L[args.td_y * (i + 1):, L_offset + args.td_y * i : L_offset + args.td_y * (i + 1)].T @ \
55
+ (b_Wr_T[args.td_y * (i + 1):] - b_hatWr_T[args.td_y * (i + 1):]) + \
56
+ b_prod[args.td_y * i : args.td_y * (i + 1)]
57
+ if trellissz > -1:
58
+ WXWXshape = WXWX.shape
59
+ thing = WXWX.T.reshape(-1, trellissz)
60
+ if for_kernel:
61
+ thing = thing[..., _PERMUTE]
62
+ if use_beam_search:
63
+ # D: (n // td_y, td_y, td_y)
64
+ D_cur = D[cur_col - buf_size + i] # (td_y, td_y)
65
+ D_tiled = torch.kron(torch.eye(args.td_y, device=D_cur.device, dtype=D_cur.dtype), D_cur)
66
+ if for_kernel:
67
+ D_tiled = D_tiled[:, _PERMUTE][_PERMUTE, :]
68
+ q_out = cb.quantize_beam_search_with_hessian(thing, D_tiled, beam_sz=1024)
69
+ else:
70
+ if use_diag:
71
+ D_cur = D[cur_col - buf_size + i] # (td_y, td_y)
72
+ weight = torch.diag(D_cur).repeat(trellissz // args.td_y)[_PERMUTE]
73
+ q_out = cb.quantize(thing, w2=weight)
74
+ else:
75
+ q_out = cb.quantize(thing)
76
+ if for_kernel:
77
+ thing = q_out[0][..., _INV_PERMUTE].reshape(
78
+ WXWXshape[1], WXWXshape[0])
79
+ else:
80
+ thing = q_out[0].reshape(WXWXshape[1], WXWXshape[0])
81
+ idxs = q_out[1].reshape(WXWXshape[1], WXWXshape[0] // args.V)
82
+ b_hatWr_T[args.td_y * i:args.td_y * (i + 1)] = thing.T
83
+ b_Qidxs_T[args.td_y // args.V * i:args.td_y // args.V *
84
+ (i + 1)] = idxs.T
85
+ else:
86
+ raise NotImplementedError
87
+ # q_out = cb.quantize(WXWX.T)
88
+ # b_hatWr_T[args.td_y * i:args.td_y * (i + 1)] = q_out[0].T
89
+ # b_Qidxs_T[args.td_y // args.V * i:args.td_y // args.V *
90
+ # (i + 1)] = q_out[1].T
91
+
92
+ prod_cache += b_L.T @ (b_Wr_T - b_hatWr_T)
93
+ hatWr_T[args.td_y * (cur_col - buf_size):args.td_y *
94
+ cur_col] = b_hatWr_T
95
+
96
+ del b_Wr_T, b_hatWr_T, b_L, b_prod, L_offset, prod_cache
97
+ utils.clean()
98
+ return hatWr_T.T.contiguous(), Qidxs_T.T.contiguous()
99
+
100
+ def calc_obj(hatWr_T, Wr_T, HRr):
101
+ diff_T = hatWr_T.cuda() - Wr_T.cuda()
102
+ obj = torch.trace(diff_T.T @ HRr @ diff_T)
103
+ return obj.cpu().item()
104
+
105
+ def CD(Wr, HRr, Qidxs, hatWr, cb, args, buf_cols=128, for_kernel=True, use_beam_search=False):
106
+ if for_kernel:
107
+ assert args.td_x == 16 and args.td_y == 16
108
+ buf_cols = max(buf_cols, args.td_y)
109
+ trellissz = args.td_x * args.td_y
110
+ (m, n) = Wr.shape
111
+ assert buf_cols % args.td_y == 0
112
+ assert n % buf_cols == 0
113
+ assert args.td_y % args.V == 0
114
+ buf_size = buf_cols // args.td_y
115
+
116
+ hatWr_T = hatWr.T.contiguous()
117
+ Qidxs_T = Qidxs.T.contiguous()
118
+ device = hatWr.device
119
+ hatWr = hatWr.cpu()
120
+ utils.clean()
121
+ Wr_T = Wr.T.contiguous().to(device)
122
+
123
+ # obj = calc_obj(hatWr_T, Wr_T, HRr)
124
+ # print("init obj", obj)
125
+ for cur_col in tqdm(range(n // args.td_y, 0, -buf_size)):
126
+ b_Wr_T = Wr_T[args.td_y * (cur_col - buf_size):args.td_y * cur_col]
127
+ b_hatWr_T = hatWr_T[args.td_y * (cur_col - buf_size):args.td_y *
128
+ cur_col]
129
+ b_Qidxs_T = Qidxs_T[args.td_y * (cur_col - buf_size) //
130
+ args.V:args.td_y * cur_col // args.V]
131
+ b_HRr = HRr[args.td_y * (cur_col - buf_size):args.td_y * cur_col, args.td_y * (cur_col - buf_size):args.td_y * cur_col] # (buf_size * td_y, buf_size * td_y)
132
+
133
+ # update global hessian
134
+ res_inds = torch.cat([
135
+ torch.arange(0, (cur_col - buf_size) * args.td_y, device=device),
136
+ torch.arange(cur_col * args.td_y, n, device=device)
137
+ ])
138
+ Wr_diff_T = hatWr_T - Wr_T # (n, m)
139
+ b_global_hess = torch.matmul(Wr_diff_T[res_inds].T, HRr[res_inds, args.td_y * (cur_col - buf_size):args.td_y * cur_col]) # (m, buf_size * td_y)
140
+ for i in reversed(range(buf_size)):
141
+ start_col, end_col = args.td_y * i, args.td_y * (i + 1)
142
+ WXWX = b_Wr_T[start_col:end_col] # (td_y, m)
143
+ b_Wr_diff_T = b_hatWr_T - b_Wr_T # (td_y * buf_size, m)
144
+ if trellissz > -1:
145
+ WXWXshape = WXWX.shape
146
+ thing = WXWX.T.reshape(-1, trellissz) # (-1, trellissz)
147
+ if for_kernel:
148
+ thing = thing[..., _PERMUTE]
149
+
150
+ # local hessian
151
+ HRr_cur = b_HRr[start_col:end_col, start_col:end_col].contiguous() # (td_y, td_y)
152
+ HRr_tiled = torch.kron(torch.eye(args.td_y, device=HRr_cur.device, dtype=HRr_cur.dtype), HRr_cur)
153
+
154
+ # global hessian
155
+ cur_global_hess = b_global_hess[:, start_col:end_col] # (m, td_y)
156
+ cur_res_ind = torch.cat([
157
+ torch.arange(0, start_col, device=device),
158
+ torch.arange(end_col, buf_size * args.td_y, device=device)
159
+ ]) # 나머지 indices for args.td_y * i : args.td_y * (i + 1)
160
+
161
+ cur_global_hess_res = torch.matmul(b_Wr_diff_T[cur_res_ind].T, b_HRr[cur_res_ind, start_col:end_col]) # (m, td_y)
162
+ cur_weight = cur_global_hess + cur_global_hess_res # (m, td_y)
163
+ cur_weight = cur_weight.reshape(-1, trellissz)
164
+ if for_kernel:
165
+ cur_weight = cur_weight[..., _PERMUTE] # (-1, trellissz)
166
+ HRr_tiled = HRr_tiled[:, _PERMUTE][_PERMUTE, :]
167
+
168
+ cur_hatWr_T = b_hatWr_T[start_col:end_col].T.reshape(-1, trellissz)[..., _PERMUTE].contiguous() # (-1, trellissz)
169
+ cur_qidx = b_Qidxs_T[args.td_y // args.V * i:args.td_y // args.V * (i + 1)].T.reshape(-1, trellissz // args.V) # (-1, trellissz)
170
+ diff = cur_hatWr_T - thing
171
+ obj_before = torch.diag(diff @ HRr_tiled @ diff.T) + torch.sum(cur_weight * diff, dim=-1) * 2 # (-1)
172
+
173
+ if use_beam_search:
174
+ q_out = cb.quantize_beam_search_with_hessian(thing, HRr_tiled, U=cur_weight, beam_sz=1024)
175
+ else:
176
+ q_out = cb.quantize(thing, w1=cur_weight * 2, w2=torch.diag(HRr_tiled))
177
+ diff = q_out[0] - thing
178
+ obj_after = torch.diag(diff @ HRr_tiled @ diff.T) + torch.sum(cur_weight * diff, dim=-1) * 2 # (-1)
179
+
180
+ # select only improved
181
+ improved = obj_before > obj_after
182
+ # out[i] = q_out[0][i] if improved[i] else cur_hatWr_T[i]
183
+ new_hatWr_T = torch.where(improved.unsqueeze(-1), q_out[0], cur_hatWr_T)
184
+ new_qidx = torch.where(improved.unsqueeze(-1), q_out[1], cur_qidx)
185
+
186
+ if for_kernel:
187
+ thing = new_hatWr_T[..., _INV_PERMUTE].reshape(
188
+ WXWXshape[1], WXWXshape[0])
189
+ else:
190
+ thing = new_hatWr_T.reshape(WXWXshape[1], WXWXshape[0])
191
+ idxs = new_qidx.reshape(WXWXshape[1], WXWXshape[0] // args.V)
192
+ b_hatWr_T[args.td_y * i:args.td_y * (i + 1)] = thing.T
193
+ b_Qidxs_T[args.td_y // args.V * i:args.td_y // args.V *
194
+ (i + 1)] = idxs.T
195
+
196
+
197
+ hatWr_T[args.td_y * (cur_col - buf_size):args.td_y *
198
+ cur_col] = b_hatWr_T
199
+ else:
200
+ raise NotImplementedError
201
+ hatWr_T[args.td_y * (cur_col - buf_size):args.td_y *
202
+ cur_col] = b_hatWr_T
203
+
204
+ # obj = calc_obj(hatWr_T, Wr_T, HRr)
205
+ # print("cur_col", cur_col, "obj", obj)
206
+
207
+ del b_Wr_T, b_hatWr_T
208
+ utils.clean()
209
+ return hatWr_T.T.contiguous(), Qidxs_T.T.contiguous()
lib/codebook/__pycache__/bitshift.cpython-311.pyc ADDED
Binary file (30.8 kB). View file
 
lib/codebook/__pycache__/vq_codebook.cpython-311.pyc ADDED
Binary file (3.9 kB). View file
 
lib/codebook/bitshift.py ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import math
3
+ import os
4
+ from functools import cache
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch import nn
9
+ from tqdm import tqdm
10
+
11
+ from lib.utils.kernel_check import has_kernel
12
+ from lib.utils.kernel_decompress import decode_compressed, bitshift_linear_kernel
13
+ from lib.utils.matmul_had import matmul_hadU_cuda, matmul_hadUt_cuda
14
+ import time
15
+
16
def decode_1mad(x):
    """Decode 32-bit integer codes into approximately Gaussian floats.

    A single multiply-add (LCG) step mixes each code; the four bytes of
    the mixed value are summed (a sum of uniforms is roughly normal),
    centered at zero, and rescaled to unit-ish variance.
    """
    mask32 = (1 << 32) - 1
    state = (x.to(torch.int64) & mask32) * 34038481 + 76625530
    state = state & mask32
    byte_sum = sum((state >> shift) & 255 for shift in (0, 8, 16, 24))
    return (byte_sum - 510).to(torch.float32) / 147.800537109375
26
+
27
+
28
def decode_2mad(x):
    """Decode 32-bit integer codes into approximately Gaussian floats.

    Like decode_1mad but with a second mixing step (multiply, take high
    word, add back) for better pseudo-random quality before the byte-sum
    Gaussian approximation.
    """
    mask32 = (1 << 32) - 1
    state = (x.to(torch.int64) & mask32) * 264435761 + 1013904223
    state = state & mask32
    state = ((state * 1664525) >> 32) + state
    state = state & mask32
    byte_sum = sum((state >> shift) & 255 for shift in (0, 8, 16, 24))
    return (byte_sum - 510).to(torch.float32) / 147.800537109375
40
+
41
+
42
def decode_3inst(x):
    """Decode integer codes into floats via a "3 instruction" style hash.

    The code is mixed with one multiply-add, masked into two fp16 bit
    patterns (high and low halfwords), and the two fp16 values are summed.
    """

    def bfe16_to_fp16(x):
        # Reinterpret the low 16 bits as a signed int16 bit pattern, then
        # bit-cast to fp16. NOTE(review): mutates `x` in place; both call
        # sites below pass freshly computed tensors, so no aliasing occurs.
        x[torch.where(x >= 2**15)] -= 2**16
        return torch.tensor(x.to(torch.int16).numpy().view(np.float16))

    a = 89226354          # multiplier of the mixing step
    b = 64248484          # increment of the mixing step
    fpmask = 996162400    # XOR mask shaping raw bits into fp16 patterns
    x = x.to(torch.int64)
    x = x & ((1 << 32) - 1)
    x = x * a + b
    # Keep the sign bit and low 12 mantissa-ish bits of each 16-bit half.
    mask = (1 << 15) + ((1 << 12) - 1)
    mask = (mask << 16) + mask
    res = (mask & x) ^ fpmask
    top = bfe16_to_fp16(res >> 16)
    bottom = bfe16_to_fp16(res & ((1 << 16) - 1))
    return (top + bottom).float()
60
+
61
+
62
def quantlut(tlut, L, nbits):
    """Expand the small table `tlut` (2**nbits rows) into a 2**L-entry LUT.

    Each L-bit state index i is hashed as i*(i+1); bits [16-nbits, 16) of
    the hash pick a row of `tlut`, so many states share each table row.
    """
    with torch.no_grad():
        states = torch.arange(1 << L)
        hashed = states * (states + 1)
        rows = (hashed >> (16 - nbits)) & ((1 << nbits) - 1)
        expanded = tlut[rows]
    return expanded
69
+
70
+
71
def quantlut_sym(tlut, L, nbits):
    """Expand `tlut` into a sign-symmetric 2**L-entry LUT.

    Like `quantlut`, but one hashed bit flips the sign of component 0,
    symmetrically doubling the effective codebook.
    """
    with torch.no_grad():
        lut = torch.arange(1 << L, device=tlut.device)
        lut = (lut + 1) * lut
        # Bit 15 of the hash chooses the sign (+1 or -1) for component 0.
        sflp = 1 - ((lut >> 15) & 1) * 2
        # One shift fewer than `quantlut`: nbits index bits plus the sign bit
        # are consumed from the low 16 bits of the hash.
        lut = (lut >> (16 - nbits - 1)) & ((1 << nbits) - 1)
        lut = tlut[lut]
        lut[:, 0] = lut[:, 0] * sflp
    return lut
80
+
81
+
82
+ class bitshift_codebook(nn.Module):
83
+
84
    def __init__(self,
                 L=16,
                 KV=4,
                 V=2,
                 tlut_bits=16,
                 decode_mode='lut',
                 tlut=None):
        """Trellis ("bitshift") codebook.

        Args:
            L: number of state bits; the trellis has 2**L states.
            KV: bits shifted in per step, so each state has 2**KV successors.
            V: vector dimension decoded per state.
            tlut_bits: log2 size of the small underlying table `tlut`.
            decode_mode: how a state maps to values -- a stored table
                ('lut'), a computed hash ('1mad'/'2mad'/'3inst'), or a
                hashed table ('quantlut'/'quantlut_sym').
            tlut: optional externally supplied table; when given, `lut` is
                rebuilt from it via recons_lut().
        """
        super(bitshift_codebook, self).__init__()
        self.idx_dtype = torch.int32
        self.opt_scale = 1

        self.L = L
        self.KV = KV
        self.V = V
        self.tlut_bits = tlut_bits
        self.decode_mode = decode_mode

        if decode_mode == 'lut':
            if tlut is None:
                # A full random table: one V-dim vector per state.
                assert tlut_bits == L
                self.register_buffer('tlut', torch.randn(2**L, V))
                self.register_buffer('lut', self.tlut.T.contiguous())
            else:
                self.tlut = tlut
                self.recons_lut()

        elif decode_mode == '1mad':
            # Hash-based decoders are scalar only (V == 1); the LUT holds the
            # decoded value of every state.
            assert V == 1
            self.register_buffer('lut',
                                 decode_1mad(torch.arange(2**L)).unsqueeze(0))
        elif decode_mode == '2mad':
            assert V == 1
            self.register_buffer('lut',
                                 decode_2mad(torch.arange(2**L)).unsqueeze(0))
        elif decode_mode == '3inst':
            assert V == 1
            self.register_buffer('lut',
                                 decode_3inst(torch.arange(2**L)).unsqueeze(0))
        elif decode_mode == 'quantlut':
            if tlut is None:
                assert tlut_bits > 0
                if V == 1:
                    # Inverse-CDF sampling of a standard normal.
                    tlut = torch.erfinv((torch.arange(1 << tlut_bits) + 0.5) /
                                        (1 << tlut_bits) * 2 -
                                        1) * torch.tensor(2.0).sqrt()
                elif V == 2:
                    # Spiral-like 2D points: radius from a log profile,
                    # angle from the index.
                    n = 2**tlut_bits
                    tlut = torch.zeros(n)  # NOTE(review): dead assignment, overwritten below
                    R = ((n / (n - torch.arange(n))).log() * 2).sqrt()
                    tlut = torch.stack(
                        [R * torch.arange(n).sin(), R * torch.arange(n).cos()],
                        dim=-1)
                else:
                    raise Exception
                # NOTE(review): unsqueeze(-1) is applied to both the V==1 and
                # V==2 tables; confirm the intended shape for V == 2.
                self.register_buffer('tlut', tlut.unsqueeze(-1))
                self.register_buffer(
                    'lut',
                    quantlut(self.tlut, L, tlut_bits).T.contiguous())
            else:
                self.tlut = tlut
                self.recons_lut()
        elif decode_mode == 'quantlut_sym':
            if tlut is None:
                assert tlut_bits > 0
                if V == 2:
                    # K-means centroids on Gaussian data, cached on disk.
                    # NOTE(review): assumes assets/lut_cache/ exists and is
                    # writable relative to the CWD.
                    fname = f'assets/lut_cache/kmeans_{tlut_bits}_{V}.pt'
                    if not os.path.exists(fname):
                        tlut = torch.randn(2**tlut_bits, V)
                        import scipy
                        data = torch.randn(1 << 20, 2)
                        clusters = scipy.cluster.vq.kmeans(data, tlut)
                        tlut = torch.tensor(clusters[0])
                        # Normalize to the variance expected by the decoder.
                        tlut = (tlut /
                                tlut.std(unbiased=False)) * 0.9682458365518543
                        torch.save(tlut, fname)
                    else:
                        tlut = torch.load(fname)
                else:
                    raise Exception
                self.register_buffer('tlut', tlut)
                self.register_buffer(
                    'lut',
                    quantlut_sym(self.tlut, L, tlut_bits).T.contiguous())
            else:
                self.tlut = tlut
                self.recons_lut()
        else:
            raise Exception

        # Plain attribute (not a buffer), used as +inf sentinel in viterbi.
        self.fakeinf = torch.tensor(torch.inf)

        # Offsets of the 2**KV low-bit variants placed in the top KV bits.
        self.register_buffer('sumdelta',
                             torch.arange(2**(KV)) << (L - KV))
        self.sumdelta = self.sumdelta.view(1, 1, -1)

        self.register_buffer('state', torch.arange(2**L).unsqueeze(0))
        # For each truncated (L-KV)-bit prefix: the 2**KV full states that
        # can transition into states sharing that prefix.
        self.register_buffer('state_cand',
                             (self.state >>
                              (KV))[0, ::2**(KV)].unsqueeze(-1) +
                             self.sumdelta)
        # Precomputed reconstruction of every state: shape (V, 2**L).
        self.register_buffer('recons_state', self.recons(self.state))

        self.version = 0
187
+
188
+ def recons_lut(self):
189
+ if self.decode_mode == 'lut':
190
+ self.lut = self.tlut.T.contiguous()
191
+ elif self.decode_mode == 'quantlut':
192
+ self.lut = quantlut(self.tlut, self.L,
193
+ self.tlut_bits).T.contiguous()
194
+ elif self.decode_mode == 'quantlut_sym':
195
+ self.lut = quantlut_sym(self.tlut, self.L,
196
+ self.tlut_bits).T.contiguous()
197
+
198
+ def recons(self, encoded, **kwargs):
199
+ return self.lut[:,
200
+ encoded.int().to(self.lut.device)].to(encoded.device)
201
+
202
    @torch.compile
    def update(self, cost, thing):
        """One Viterbi transition step.

        Args:
            cost: (B, 2**L) accumulated path cost per state.
            thing: (V, B) next V-dim chunk of the signal to quantize.
        Returns:
            (prev_state, new_cost): chosen predecessor per truncated state
            and the updated (B, 2**L) path costs.
        """
        # Squared reconstruction error of every state against this chunk.
        state_err = (self.recons_state -
                     thing.unsqueeze(-1)).square().sum(dim=0)
        # Gather, for each truncated (L-KV)-bit prefix, the costs of the
        # 2**KV full states that can transition into it.
        cand_cost = torch.gather(
            cost.unsqueeze(-2).expand(-1, self.state_cand.shape[1], -1), -1,
            self.state_cand.expand(len(cost), -1, 2**(self.KV)))
        best = torch.min(cand_cost, dim=-1)
        # New path cost = local error + best predecessor cost, broadcast to
        # the 2**KV states sharing each truncated prefix.
        cost = state_err + best.values.unsqueeze(-1).expand(
            -1, -1, 2**(self.KV)).reshape(state_err.shape)
        # Remember which predecessor won, for backtracking in viterbi().
        prev_state = torch.gather(
            self.state_cand.expand(thing.shape[1], -1, -1), -1,
            best.indices.unsqueeze(-1))[..., 0]
        return prev_state, cost
216
+
217
    def viterbi(self, X, overlap=None):
        """Viterbi-quantize columns of X through the bitshift trellis.

        X (T, B): B independent length-T sequences (T divisible by V).
        overlap: optional per-column truncated states; when given, both the
            first and last state of each path are constrained to extend the
            corresponding overlap prefix (used to make the sequence wrap
            consistently -- see quantize()).
        Returns (T // V, B) int32 chosen states.
        """
        T, B = X.shape
        assert T % self.V == 0
        # cost is (B, 2**L)
        cost = (self.recons_state -
                X[:self.V].unsqueeze(-1)).square().sum(dim=0)

        if overlap is not None:
            # Disallow (cost = fakeinf) every starting state that does not
            # extend the overlap prefix.
            mask = torch.ones(B, 2**self.L, device=X.device) * self.fakeinf
            allow = (overlap <<
                     (self.KV)).unsqueeze(-1) + torch.arange(
                         2**(self.KV)).to(X.device).view(1, 1, -1)
            mask.scatter_(1, allow[0], 0)
            cost = torch.min(cost + mask, self.fakeinf)

        # Backpointers: best predecessor per truncated state at each step.
        from_state = torch.zeros(T // self.V,
                                 B,
                                 2**(self.L - self.KV),
                                 dtype=self.state.dtype,
                                 device=self.state.device)

        for i in range(1, T // self.V):
            from_state[i], cost = self.update(cost,
                                              X[i * self.V:(i + 1) * self.V])

        if overlap is not None:
            # Constrain the final state to the allowed overlap set as well.
            mask = torch.ones(B, 2**self.L, device=X.device) * self.fakeinf
            allow = (overlap.unsqueeze(-1) + self.sumdelta.unsqueeze(0))
            mask.scatter_(1, allow[0, 0], 0)
            cost = torch.min(cost + mask, self.fakeinf)

        # Backtrack from the cheapest final state.
        final_state = torch.zeros(T // self.V,
                                  B,
                                  dtype=self.idx_dtype,
                                  device=X.device)
        final_state[T // self.V - 1] = torch.argmin(cost, dim=-1)
        for i in range(T // self.V - 1, 0, -1):
            # The predecessor is indexed by the truncated (>> KV) state.
            final_state[i - 1] = torch.gather(
                from_state[i], -1,
                (final_state[i].to(torch.int64).unsqueeze(-1)) >>
                (self.KV))[..., 0]
        return final_state
262
+
263
    def quantize_seq(self, X, overlap=None, **kwargs):
        """Run viterbi() over X (T, NO) in batches along the column axis.

        Columns are padded to a multiple of the batch size `bs` (capped so
        that bs * 2**L stays within a memory budget), processed chunk by
        chunk, and the padding is stripped from the result.
        """
        T, NO = X.shape
        bs = min(2**(24 - self.L), NO)
        pad_amt = math.ceil(NO / bs) * bs - NO
        X = torch.nn.functional.pad(X, (0, pad_amt))
        T, N = X.shape
        # (num_chunks, T, bs)
        X = X.reshape(T, N // bs, bs).transpose(0, 1).contiguous()
        if overlap is not None:
            overlap = torch.nn.functional.pad(overlap, (0, pad_amt))
            overlap = overlap.reshape(N // bs, bs)

        Qidxs = torch.zeros(N // bs,
                            T // self.V,
                            bs,
                            dtype=self.idx_dtype,
                            device=X.device)
        for i in range(len(X)):
            b_overlap = None if overlap is None else overlap[i]
            Qidxs[i] = self.viterbi(X[i], overlap=b_overlap)
        # Back to (T // V, NO), dropping the padded columns.
        Qidxs = Qidxs.transpose(0, 1).reshape(T // self.V, N)[:, :NO]
        return Qidxs
284
+
285
    def quantize(self, X, **kwargs):
        """Quantize X (rows are sequences) and return (hatX, states).

        Two passes are made: the first quantizes a half-rotated copy of the
        sequence to extract a stable mid-point state, which then constrains
        the second (final) pass so the trellis path is consistent at the
        sequence boundary.
        """
        X = X.T.contiguous().to(torch.float16)
        T = X.shape[0]
        roll_X = torch.roll(X, T // (2 * self.V) * self.V, 0)
        state = self.quantize_seq(roll_X, overlap=None)
        # Truncated state at the midpoint becomes the overlap constraint.
        overlap = state[T // (2 * self.V)] >> self.KV
        state = self.quantize_seq(X, overlap=overlap)
        hatX = self.recons(state).transpose(0, 1).reshape(X.shape)
        return hatX.T.contiguous().to(X.device), state.T.contiguous().to(
            X.device)
295
+
296
    def pack_trellis(self, trellis):
        """Pack (B, T) trellis states into uint16 words.

        Consecutive states share L-KV bits (each step shifts in only KV new
        bits), so after storing the full first state only the KV fresh bits
        of each subsequent state are stored. The bitstream is padded to a
        multiple of 16 and folded into big-endian uint16 words.
        """
        # T is really T // self.V here
        B, T = trellis.shape
        bf = torch.zeros(B,
                         T * self.KV + self.L - self.KV,
                         dtype=bool,
                         device=trellis.device)
        # Full L bits of the first state, MSB first.
        bf[:, :self.L] = (trellis[:, 0].unsqueeze(-1) & (2**torch.arange(
            self.L, device=trellis.device).flip(dims=(-1, ))).unsqueeze(0)) > 0
        K_mask = 2**torch.arange(
            self.KV,
            device=trellis.device).flip(dims=(-1, )).unsqueeze(0)
        for i in range(1, T):
            # Invariant of the bitshift trellis: state i's high bits are
            # state i-1's low bits.
            assert ((trellis[:, i - 1] &
                     ((1 << (self.L - self.KV)) - 1)) == (
                         trellis[:, i] >> (self.KV))).all()
            # Store only the KV newly shifted-in bits.
            bf[:,
               (self.L +
                (i - 1) * self.KV):(self.L + i * self.KV)] = (
                    (trellis[:, i] &
                     ((1 <<
                       (self.KV)) - 1)).unsqueeze(-1) & K_mask) > 0

        # The trailing L-KV bits duplicate the next block's prefix; drop.
        bf = bf[:, :-(self.L - self.KV)]
        pad_amt = math.ceil(
            T * self.KV / 16) * 16 - T * self.KV
        bf = torch.nn.functional.pad(bf, (0, pad_amt)).reshape(
            -1, (T * self.KV + pad_amt) // 16, 16)

        # Fold each group of 16 bits (MSB first) into one uint16.
        uint_mask = (2**torch.arange(
            16, dtype=torch.int32,
            device=bf.device)).flip(dims=(-1, )).unsqueeze(0).unsqueeze(0)
        bf_sum = (bf.to(torch.int32) * uint_mask).sum(dim=-1)
        return bf_sum.to(torch.uint16)
330
+
331
+ class BitshiftLinear(nn.Module):
332
+
333
    def __init__(self,
                 td_x,
                 td_y,
                 L,
                 K,
                 V,
                 tlut_bits,
                 decode_mode,
                 dtype=torch.float16,
                 tlut=None,
                 has_kernel=False):
        """Linear layer whose weight is stored as a packed bitshift trellis.

        Args:
            td_x, td_y: tile dimensions of the weight matrix.
            L, K, V, tlut_bits, decode_mode, tlut: codebook parameters; K is
                passed to bitshift_codebook as its KV argument.
            dtype: compute dtype for cached-weight matmuls.
            has_kernel: use the fused CUDA decompress kernel when True.
                NOTE(review): this parameter shadows the imported
                `has_kernel` helper from lib.utils.kernel_check.
        """
        super().__init__()
        self.td_x = td_x
        self.td_y = td_y
        self.V = V
        self.cb = bitshift_codebook(L, K, V, tlut_bits, decode_mode, tlut=tlut)
        self.internal_dtype = dtype
        self.has_kernel = has_kernel
        # Fixed-point style scale applied around the Hadamard transforms.
        self.scale = 32
352
+
353
+ def get_hatW(self, unpacked_trellis, m, n):
354
+ return self.cb.recons(unpacked_trellis).transpose(0, 1).transpose(
355
+ 1, 2).reshape(m // self.td_x, n // self.td_y, self.td_x,
356
+ self.td_y).transpose(1, 2).reshape(m, n)
357
+
358
    def get_hatW_kernel(self, trellis, m, n):
        """Reconstruct the (m, n) weight matrix from the *packed* trellis
        using the CUDA decode kernel (decode_compressed)."""
        out = decode_compressed(self.cb.L, self.cb.tlut_bits, self.cb.K,
                                int(math.log2(self.V)), m, n, trellis.view(-1),
                                self.cb.lut.T)
        return out
363
+
364
    def cache_hatW(self, packed_trellis, had_left, had_right, K_left, K_right,
                   m, n, rcp, tp_rank):
        """Decode the packed weight once and cache the de-rotated matrix.

        The trellis is decoded (kernel or pure-torch path), rescaled, and
        the left/right Hadamard rotations are undone so `self.hatW` holds a
        plain weight usable as `x @ hatW.T`. `rcp` selects how the matrix
        is split across `tp_rank` tensor-parallel shards before rotating.
        NOTE(review): `self.cb.unpack_trellis` is defined outside this view
        of the file -- confirm it exists on bitshift_codebook.
        """
        if self.has_kernel:
            hatW = self.get_hatW_kernel(packed_trellis, m, n)
        else:
            hatW = self.get_hatW(
                self.cb.unpack_trellis(packed_trellis, self.td_x * self.td_y),
                m, n)
        hatW = hatW.float() / self.scale

        if rcp == 1:
            # Shard along the input dimension before the left rotation.
            self.hatW = matmul_hadU_cuda(
                matmul_hadU_cuda(hatW.reshape(tp_rank * m, n // tp_rank),
                                 had_left, K_left).reshape(m, n).T, had_right,
                K_right).T.contiguous().to(self.internal_dtype)
        elif rcp == 2:
            # Shard along the output dimension before the right rotation.
            self.hatW = matmul_hadU_cuda(
                matmul_hadU_cuda(hatW, had_left,
                                 K_left).T.reshape(tp_rank * n,
                                                   m // tp_rank), had_right,
                K_right).reshape(n, m).T.contiguous().to(self.internal_dtype)
        else:
            # No tensor-parallel reshaping.
            self.hatW = matmul_hadU_cuda(
                matmul_hadU_cuda(hatW, had_left, K_left).T, had_right,
                K_right).T.contiguous().to(self.internal_dtype)
389
+
390
    def forward(self,
                input,
                trellis,
                SU,
                SV,
                had_left,
                had_right,
                K_left,
                K_right,
                rcp,
                tp_rank,
                mode='eval',
                use_prev_kernel=False,
                **kwargs):
        """Apply the quantized linear layer.

        Pipeline: scale input by SU, rotate with the left Hadamard, multiply
        by the decoded weight (fused CUDA kernel, autograd kernel wrapper,
        cached hatW, or on-the-fly reconstruction depending on `mode` /
        `has_kernel` / batch size), rotate back with the right Hadamard, and
        scale by SV.

        mode: 'eval' (packed trellis, unpacked here), 'train-fixW' (use the
        cached self.hatW from cache_hatW), or 'train-recons' (rebuild the
        LUT from tlut each call so tlut gradients flow through fresh values).
        """
        n, m = len(SU), len(SV)
        x = input.view(-1, n).to(torch.float32)
        x = x * SU

        if mode == 'train-fixW':
            # Dense matmul against the cached, de-rotated weight.
            x = (x.to(self.internal_dtype) @ self.hatW.T).float()
        else:
            bs = x.shape[0]

            if rcp == 1:
                # Rotate each tensor-parallel shard independently.
                x = matmul_hadUt_cuda(x.reshape(-1, n // tp_rank), had_left,
                                      K_left).reshape(x.shape) / self.scale
            else:
                x = matmul_hadUt_cuda(x, had_left, K_left) / self.scale

            if bs == 1 and self.has_kernel:
                # Batch-1 fast path: fused decompress+GEMV CUDA op.
                # NOTE(review): `self.cb.K` -- bitshift_codebook stores the
                # shift width as `self.KV`, not `K`; confirm `K` is set on
                # the codebook elsewhere.
                wrapper = getattr(
                    torch.ops.quip_lib,
                    f"decompress_gemm_tcq_{m}_1_{x.numel()}_{self.cb.K}")

                x = wrapper(trellis, x, self.cb.tlut)

            else:
                if mode == 'train-recons':
                    # Rebuild lut from tlut so training sees fresh values.
                    self.cb.recons_lut()

                if self.has_kernel:
                    if use_prev_kernel:
                        # Autograd-aware wrapper (recomputes hatW backward).
                        x = BitshiftLinearKernelAG.apply(
                            x, trellis, m, n, self.cb.L, self.cb.tlut_bits, self.cb.K,
                            self.V, self.cb.lut).float()
                    else:
                        x = bitshift_linear_kernel(
                            x, trellis, m, n, self.cb.L, self.cb.tlut_bits, self.cb.K,
                            self.V, self.cb.lut).float()
                else:
                    if mode == 'eval':
                        trellis = self.cb.unpack_trellis(
                            trellis, self.td_x * self.td_y)
                    hatW = self.get_hatW(trellis, m, n)
                    x = (x.to(hatW.dtype) @ hatW.T).float()

            if rcp == 2:
                x = matmul_hadU_cuda(x.reshape(-1, m // tp_rank), had_right,
                                     K_right).reshape(x.shape)
            else:
                x = matmul_hadU_cuda(x, had_right, K_right)

        # Undo the 1/scale applied before the forward rotation.
        x = x.to(SV.device) * (SV * self.scale)
        return x.view(*input.shape[:-1], m).to(input.dtype)
454
+
455
+
456
+
457
class BitshiftLinearKernelAG(torch.autograd.Function):
    """Autograd wrapper around the compressed-weight matmul.

    The decoded weight hatW is not stored for backward; instead the packed
    trellis is saved and hatW is re-decoded in backward() (recompute over
    memory).
    """

    @staticmethod
    def forward(ctx, input, trellis, m, n, L, tlut_bits, K, V, lut):
        ctx.save_for_backward(trellis, lut)
        ctx.L = L
        ctx.tlut_bits = tlut_bits
        ctx.K = K
        ctx.V = V
        ctx.m = m
        ctx.n = n

        hatW = decode_compressed(L, tlut_bits, K, int(math.log2(V)),
                                 m, n, trellis.view(-1), lut.T)
        return input.to(hatW.dtype) @ hatW.T

    @staticmethod
    def backward(ctx, grad_output):
        trellis, lut = ctx.saved_tensors
        L = ctx.L
        tlut_bits = ctx.tlut_bits
        K = ctx.K
        V = ctx.V
        m = ctx.m
        n = ctx.n

        # Re-decode the weight; only the input receives a gradient.
        hatW = decode_compressed(L, tlut_bits, K, int(math.log2(V)),
                                 m, n, trellis.view(-1), lut.T)

        grad_input = grad_output.to(hatW.dtype) @ hatW
        return grad_input, None, None, None, None, None, None, None, None
lib/codebook/vq_codebook.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ from torch import nn
5
+
6
+ from lib.utils.kmeans import kmeans_flash1d, kmeans_sklearn
7
+
8
+ class vq_codebook(nn.Module):
9
    def __init__(self,
                 vec_sz=2,
                 lut_bits=8):
        """Vector-quantization codebook with 2**lut_bits entries of dim vec_sz.

        The codebook is k-means centroids fit on standard-normal data and is
        cached on disk; subsequent constructions load the cached table.
        NOTE(review): assumes assets/lut_cache/ exists relative to the CWD.
        """
        super(vq_codebook, self).__init__()
        self.idx_dtype = torch.int32
        self.vec_sz = vec_sz
        self.lut_bits = lut_bits

        fname = f'assets/lut_cache/vq_kmeans_{lut_bits}_{vec_sz}.pt'
        if not os.path.exists(fname):
            if vec_sz == 1:
                data = torch.randn(int(1e8), vec_sz)
                tlut = kmeans_flash1d(data, 2**lut_bits)
            elif vec_sz in [2,4]:
                data = torch.randn(int(1e8), vec_sz)
                # Larger codebooks use a data subsample to keep sklearn
                # k-means tractable.
                if lut_bits <= 5:
                    tlut = kmeans_sklearn(data, 2**lut_bits, max_data=int(1e8))
                else:
                    tlut = kmeans_sklearn(data, 2**lut_bits, max_data=int(1e7))
            torch.save(tlut, fname)
        else:
            tlut = torch.load(fname)
        # tlut: (2**lut_bits, vec_sz); lut: transposed copy for column lookup.
        self.register_buffer("tlut", tlut)
        self.register_buffer("lut", tlut.T.contiguous())
33
+
34
+ def recons(self, encoded, **kwargs):
35
+ return self.tlut[encoded].contiguous()
36
+
37
+ def quantize(self, X, **kwargs):
38
+ """
39
+ X : [B, vec_sz]
40
+ """
41
+ dist = torch.cdist(X, self.tlut.to(X.device, dtype=X.dtype)) # [B, 2**lut_bits]
42
+ state = torch.argmin(dist, dim=-1) # [B,] each entry is in [0, 2**lut_bits)
43
+ hatX = self.recons(state)
44
+ return hatX.to(X.device), state.to(X.device)
45
+
46
+
47
+ if __name__ == "__main__":
48
+ for vec_sz in [4]:
49
+ for lut_bits in [6,7,8,9,10,11,12]:
50
+ # for lut_bits in [1,2,3,4,5,6,7,8,9,10,11,12]:
51
+ if vec_sz == 1 and lut_bits > 8:
52
+ continue
53
+ vq = vq_codebook(vec_sz=vec_sz, lut_bits=lut_bits)
54
+ X = torch.randn(int(1e5), vec_sz)
55
+ hatX, state = vq.quantize(X)
56
+ print(f"vec_sz: {vec_sz}, lut_bits: {lut_bits}, mse: {(hatX-X).pow(2).mean()}")
lib/config.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Mapping from Hugging Face model identifiers to the short keys used
# elsewhere in this project (e.g. in asset/cache file names).
MODEL_KEYS = {
    "meta-llama/Llama-3.1-8B": "3_8b",
    "meta-llama/Llama-3.2-1B": "3_1b",
    "meta-llama/Llama-3.2-3B": "3_3b",
    "Qwen/Qwen2.5-7B": "qwen_7b",
}
lib/linear/__init__.py ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .quantized_linear import QuantizedLinear
2
+ from .incoherent_linear import IncoherentLinear
3
+ from .vq_linear import VQLinearPackSIMT, VQLinearPackTensorCore
4
+ from .tcq_linear import QTIPLinearTCQ
5
+ from .comb_linear import CombLinearTCQ, CombtLinearTCQ
6
+ import vq_tensor_kernels
7
+ import torch
8
+
9
# (m, k) = (output, input) GEMM shapes for which prebuilt kernels exist in
# the `vq_tensor_kernels` CUDA extension; ops below are registered only for
# these shapes.
kernels = [
    (53248, 16384),
    (16384, 53248),
    (1024, 16384),
    (16384, 16384),
    (4096, 14336),
    (14336, 4096),
    (28672, 4096),
    (2048, 4096),
    (5120, 4096),
    (6144, 4096),
    (1024, 4096),
    (4096, 4096),
    (4096, 11008),
    (11008, 4096),
    (22016, 4096),
    (12288, 4096),
    (8192, 4096),
    (8192, 8192),
    (10240, 8192),
    (57344, 8192),
    (8192, 1024),
    (8192, 28672),
    (28672, 8192),
    (1024, 8192),
    (5120, 5120),
    (10240, 5120),
    (15360, 5120),
    (13568, 5120),
    (27136, 5120),
    (5120, 13568),
]
41
+
42
# Register torch.library wrappers for the prebuilt VQ decompression CUDA
# kernels. For every (vtype, shape, batch size, bitrate) combination three
# op families are exposed under `ours_lib`:
#   decompress_gemm_* : fused decompress + GEMM (returns a new output)
#   decompress_gemv_* : fused decompress + GEMV writing into `out` in place
#   decompress_*      : plain weight decompression to an (m, k) fp16 tensor
# Each op gets a fake (meta) implementation so it composes with
# torch.compile / FakeTensor tracing, plus a CUDA implementation calling
# into the `vq_tensor_kernels` extension.
kdict = {}
for vtype, max_bits, bit_stride in [
    ('sq_dup', 4, 1),
    ('sq', 8, 1),
    ('vq2', 12, 1),
]:
    for m, k in kernels:
        for n in [1, 2, 4, 8]:
            for bitrate in range(2, max_bits + 1, bit_stride):

                torch.library.define(
                    f"ours_lib::decompress_gemm_{m}_{n}_{k}_{bitrate}_{vtype}",
                    "(Tensor compressed, Tensor x, Tensor codebook) -> Tensor")

                name = f"decompress_gemm_{m}_{n}_{k}_{bitrate}_{vtype}"
                kernel_name = f"vq_tensor_kernels.decompress_gemm_{bitrate}_{m}_{n}_{k}_{vtype}"
                exec(f"""\
@torch.library.register_fake("ours_lib::{name}")
def {name}_abstract(
    compressed: torch.Tensor,
    x: torch.Tensor,
    codebook: torch.Tensor) -> torch.Tensor:
    return torch.zeros({n}, {m}, dtype=torch.float32, device=x.device)

@torch.library.impl("ours_lib::{name}", "cuda")
def {name}_cuda(
    compressed: torch.Tensor,
    x: torch.Tensor,
    codebook: torch.Tensor) -> torch.Tensor:
    out = torch.zeros(({n}, {m}), dtype=torch.float32, device=x.device)
    {kernel_name}(out, compressed.reshape(-1).view(torch.int32), x.to(torch.float16), codebook.reshape(-1))
    return out
""")

        for bitrate in range(2, max_bits + 1, bit_stride):
            # GEMV variant: batch size fixed to 1, result written into `out`.
            name = f"decompress_gemv_{m}_{k}_{bitrate}_{vtype}"
            kernel_name = f"vq_tensor_kernels.decompress_gemm_{bitrate}_{m}_{1}_{k}_{vtype}"
            exec(f"""\
@torch.library.custom_op("ours_lib::{name}", mutates_args={{"out"}})
def {name}(
    compressed: torch.Tensor,
    x: torch.Tensor,
    codebook: torch.Tensor,
    out: torch.Tensor) -> torch.Tensor:
    {kernel_name}(out, compressed.reshape(-1).view(torch.int32), x.to(torch.float16), codebook.reshape(-1))
@{name}.register_fake
def {name}_fake(compressed, x, codebook, out):
    return None
""")

    for bitrate in range(2, max_bits + 1, bit_stride):
        # Plain decompression: the (m, k) shape is a call-time argument, so
        # this op is defined once per (bitrate, vtype), not per shape.
        torch.library.define(
            f"ours_lib::decompress_{bitrate}_{vtype}",
            "(Tensor compressed, Tensor codebook, int m, int k) -> Tensor")

        name = f"decompress_{bitrate}_{vtype}"
        kernel_name = f"vq_tensor_kernels.decompress_{bitrate}_{vtype}"
        exec(f"""\
@torch.library.register_fake("ours_lib::{name}")
def {name}_abstract(
    compressed: torch.Tensor,
    codebook: torch.Tensor,
    m: int,
    k: int) -> torch.Tensor:
    # BUG FIX: was `compresed.device` (NameError whenever this fake kernel
    # ran under meta/FakeTensor tracing).
    return torch.zeros(m, k, dtype=torch.float16, device=compressed.device)

@torch.library.impl("ours_lib::{name}", "cuda")
def {name}_cuda(
    compressed: torch.Tensor,
    codebook: torch.Tensor,
    m: int,
    k: int) -> torch.Tensor:
    out = torch.zeros((m, k), dtype=torch.float16, device=compressed.device)
    {kernel_name}(out, compressed.reshape(-1).view(torch.int32), codebook.reshape(-1))
    return out
""")
118
+
119
+ import tcq_kernels
120
+ import torch
121
# (m, k) = (output, input) GEMM shapes for which prebuilt kernels exist in
# the `tcq_kernels` CUDA extension.
MKSHAPE = [
    (53248, 16384),
    (16384, 53248),
    (1024, 16384),
    (16384, 16384),
    (4096, 14336),
    (14336, 4096),
    (28672, 4096),
    (5120, 4096),
    (6144, 4096),
    (512, 4096),
    (1024, 4096),
    (2048, 4096),
    (4096, 4096),
    (2048, 11008),
    (4096, 11008),
    (5504, 4096),
    (11008, 4096),
    (22016, 4096),
    (12288, 4096),
    (8192, 4096),
    (8192, 8192),
    (10240, 8192),
    (57344, 8192),
    (8192, 1024),
    (8192, 28672),
    (28672, 8192),
    (1024, 8192),
    (5120, 5120),
    (10240, 5120),
    (15360, 5120),
    (13568, 5120),
    (27136, 5120),
    (5120, 13568),
    (3072, 3072),
    (1024, 3072),
    (4096, 3072),
    (2048, 3072),
    (5120, 3072),
    (8192, 3072),
    (16384, 3072),
    (3072, 8192),
]
164
+
165
# Register the fused decompress+GEMM ops for the TCQ CUDA kernels. For each
# (shape, batch size, S, bitrate) a plain op is defined, plus "comb" and
# "combt" variants that combine two compressed streams at bitrate and
# bitrate+1 (skipped for the last bitrate, which has no +1 partner).
# The supported bitrates depend on the trellis parameter S.
kdict = {}
for S in [9, 10, 11]:
    if S == 9:
        bitrate_list = [2,3,4,5,6,7,8,9,10]
    elif S == 10:
        bitrate_list = [8, 9, 10]
    elif S == 11:
        bitrate_list = [9, 10]
    for m, k in MKSHAPE:
        for n in [1,2,4,8]:
            for bitrate in bitrate_list:
                torch.library.define(
                    f"ours_lib::decompress_gemm_tcq_{m}_{n}_{k}_{S}_{bitrate}",
                    "(Tensor compressed, Tensor x, Tensor codebook) -> Tensor")

                name = f"decompress_gemm_tcq_{m}_{n}_{k}_{S}_{bitrate}"
                kernel_name = f"tcq_kernels.decompress_gemm_16_{S}_{bitrate}_1_{m}_{n}_{k}"
                exec(f"""\
@torch.library.register_fake("ours_lib::{name}")
def {name}_abstract(
    compressed: torch.Tensor,
    x: torch.Tensor,
    codebook: torch.Tensor) -> torch.Tensor:
    return torch.zeros({n}, {m}, dtype=torch.float32, device=x.device)

@torch.library.impl("ours_lib::{name}", "cuda")
def {name}_cuda(
    compressed: torch.Tensor,
    x: torch.Tensor,
    codebook: torch.Tensor) -> torch.Tensor:
    out = torch.zeros(({n}, {m}), dtype=torch.float32, device=x.device)
    {kernel_name}(out, compressed.reshape(-1).view(torch.int32), x.to(torch.float16), codebook.reshape(-1))
    return out
""")
                # The combined-bitrate variants pair `bitrate` with
                # `bitrate + 1`; the largest bitrate has no partner.
                if bitrate == bitrate_list[-1]:
                    continue
                torch.library.define(
                    f"ours_lib::decompress_gemm_tcq_comb_{m}_{n}_{k}_{S}_{bitrate}_{int(bitrate+1)}",
                    "(Tensor compressed1, Tensor compressed2, Tensor x, Tensor codebook) -> Tensor")

                name = f"decompress_gemm_tcq_comb_{m}_{n}_{k}_{S}_{bitrate}_{int(bitrate+1)}"
                kernel_name = f"tcq_kernels.decompress_gemm_comb_16_{S}_{bitrate}_{int(bitrate+1)}_1_{m}_{n}_{k}"
                exec(f"""\
@torch.library.register_fake("ours_lib::{name}")
def {name}_abstract(
    compressed1: torch.Tensor,
    compressed2: torch.Tensor,
    x: torch.Tensor,
    codebook: torch.Tensor) -> torch.Tensor:
    return torch.zeros({n}, {m}, dtype=torch.float32, device=x.device)

@torch.library.impl("ours_lib::{name}", "cuda")
def {name}_cuda(
    compressed1: torch.Tensor,
    compressed2: torch.Tensor,
    x: torch.Tensor,
    codebook: torch.Tensor) -> torch.Tensor:
    out = torch.zeros(({n}, {m}), dtype=torch.float32, device=x.device)
    {kernel_name}(out, compressed1.reshape(-1).view(torch.int32), compressed2.reshape(-1).view(torch.int32), x.to(torch.float16), codebook.reshape(-1))
    return out
""")
                torch.library.define(
                    f"ours_lib::decompress_gemm_tcq_combt_{m}_{n}_{k}_{S}_{bitrate}_{int(bitrate+1)}",
                    "(Tensor compressed1, Tensor compressed2, Tensor x, Tensor codebook) -> Tensor")

                name = f"decompress_gemm_tcq_combt_{m}_{n}_{k}_{S}_{bitrate}_{int(bitrate+1)}"
                kernel_name = f"tcq_kernels.decompress_gemm_combt_16_{S}_{bitrate}_{int(bitrate+1)}_1_{m}_{n}_{k}"
                exec(f"""\
@torch.library.register_fake("ours_lib::{name}")
def {name}_abstract(
    compressed1: torch.Tensor,
    compressed2: torch.Tensor,
    x: torch.Tensor,
    codebook: torch.Tensor) -> torch.Tensor:
    return torch.zeros({n}, {m}, dtype=torch.float32, device=x.device)

@torch.library.impl("ours_lib::{name}", "cuda")
def {name}_cuda(
    compressed1: torch.Tensor,
    compressed2: torch.Tensor,
    x: torch.Tensor,
    codebook: torch.Tensor) -> torch.Tensor:
    out = torch.zeros(({n}, {m}), dtype=torch.float32, device=x.device)
    {kernel_name}(out, compressed1.reshape(-1).view(torch.int32), compressed2.reshape(-1).view(torch.int32), x.to(torch.float16), codebook.reshape(-1))
    return out
""")
251
+
252
# Register the plain TCQ decompression ops (no GEMM; output shape is passed
# at call time, so these are defined once per (S, bitrate)). The "comb" and
# "combt" variants combine two compressed streams at bitrate and bitrate+1;
# the last supported bitrate has no +1 partner, hence the `break`.
for S in [9, 10, 11]:
    if S == 9:
        bitrate_list = [2, 3, 4, 5, 6, 7, 8, 9, 10]
    elif S == 10:
        bitrate_list = [8, 9, 10]
    elif S == 11:
        bitrate_list = [9, 10]
    for bitrate in bitrate_list:
        torch.library.define(
            f"ours_lib::decompress_tcq_{S}_{bitrate}",
            "(Tensor compressed, Tensor codebook, int m, int k) -> Tensor")

        name = f"decompress_tcq_{S}_{bitrate}"
        kernel_name = f"tcq_kernels.decompress_16_{S}_{bitrate}"
        exec(f"""\
@torch.library.register_fake("ours_lib::{name}")
def {name}_abstract(
    compressed: torch.Tensor,
    codebook: torch.Tensor,
    m: int,
    k: int) -> torch.Tensor:
    # BUG FIX: was `compresed.device` (NameError whenever this fake kernel
    # ran under meta/FakeTensor tracing).
    return torch.zeros(m, k, dtype=torch.float16, device=compressed.device)

@torch.library.impl("ours_lib::{name}", "cuda")
def {name}_cuda(
    compressed: torch.Tensor,
    codebook: torch.Tensor,
    m: int,
    k: int) -> torch.Tensor:
    out = torch.zeros((m, k), dtype=torch.float16, device=compressed.device)
    {kernel_name}(out, compressed.reshape(-1).view(torch.int32), codebook.reshape(-1))
    return out
""")
        if bitrate == bitrate_list[-1]:
            break

        torch.library.define(
            f"ours_lib::decompress_tcq_comb_{S}_{bitrate}_{int(bitrate+1)}",
            "(Tensor compressed1, Tensor compressed2, Tensor codebook, int m, int k) -> Tensor")

        name = f"decompress_tcq_comb_{S}_{bitrate}_{int(bitrate+1)}"
        kernel_name = f"tcq_kernels.decompress_comb_16_{S}_{bitrate}_{int(bitrate+1)}"
        exec(f"""\
@torch.library.register_fake("ours_lib::{name}")
def {name}_abstract(
    compressed1: torch.Tensor,
    compressed2: torch.Tensor,
    codebook: torch.Tensor,
    m: int, k: int) -> torch.Tensor:
    return torch.zeros(m, k, dtype=torch.float16, device=compressed1.device)

@torch.library.impl("ours_lib::{name}", "cuda")
def {name}_cuda(
    compressed1: torch.Tensor,
    compressed2: torch.Tensor,
    codebook: torch.Tensor,
    m: int, k: int) -> torch.Tensor:
    out = torch.zeros((m, k), dtype=torch.float16, device=compressed1.device)
    {kernel_name}(out, compressed1.reshape(-1).view(torch.int32), compressed2.reshape(-1).view(torch.int32), codebook.reshape(-1))
    return out
""")
        torch.library.define(
            f"ours_lib::decompress_tcq_combt_{S}_{bitrate}_{int(bitrate+1)}",
            "(Tensor compressed1, Tensor compressed2, Tensor codebook, int m, int k) -> Tensor")

        name = f"decompress_tcq_combt_{S}_{bitrate}_{int(bitrate+1)}"
        kernel_name = f"tcq_kernels.decompress_combt_16_{S}_{bitrate}_{int(bitrate+1)}"
        exec(f"""\
@torch.library.register_fake("ours_lib::{name}")
def {name}_abstract(
    compressed1: torch.Tensor,
    compressed2: torch.Tensor,
    codebook: torch.Tensor,
    m: int, k: int) -> torch.Tensor:
    return torch.zeros(m, k, dtype=torch.float16, device=compressed1.device)

@torch.library.impl("ours_lib::{name}", "cuda")
def {name}_cuda(
    compressed1: torch.Tensor,
    compressed2: torch.Tensor,
    codebook: torch.Tensor,
    m: int, k: int) -> torch.Tensor:
    out = torch.zeros((m, k), dtype=torch.float16, device=compressed1.device)
    {kernel_name}(out, compressed1.reshape(-1).view(torch.int32), compressed2.reshape(-1).view(torch.int32), codebook.reshape(-1))
    return out
""")
338
+
339
+
340
+
341
+
342
+
343
+
344
# CUDA extension modules that provide the packed SIMT GEMM / dequant kernels.
import sq_pack_gemm
import vq_pack_gemm
"""
SQ Pack SIMT
"""
# Scalar-quantized (SQ) packed kernels, registered as torch custom ops under
# the "ours_lib" namespace so they compose with torch.compile / fake tensors.
torch.library.define("ours_lib::sq_pack_gemm_simt", "(Tensor x, Tensor q_weight, Tensor lut, int bitwidth) -> Tensor")

@torch.library.register_fake("ours_lib::sq_pack_gemm_simt")
def sq_pack_gemm_simt_abstract(x: torch.Tensor, q_weight: torch.Tensor, lut: torch.Tensor, bitwidth: int) -> torch.Tensor:
    # Meta/fake implementation: only shape, dtype and device matter here.
    # Output is (batch, 1, out_features) with out_features == q_weight.shape[0].
    return torch.zeros(x.shape[0], 1, q_weight.shape[0], dtype=torch.float16, device=x.device)

@torch.library.impl("ours_lib::sq_pack_gemm_simt", "cuda")
def sq_pack_gemm_simt_cuda(x: torch.Tensor, q_weight: torch.Tensor, lut: torch.Tensor, bitwidth: int) -> torch.Tensor:
    # The kernel accumulates into a pre-zeroed output buffer in place.
    output = torch.zeros(x.shape[0], 1, q_weight.shape[0], dtype=torch.float16, device=x.device)
    sq_pack_gemm.pack_gemm(x, output, q_weight, lut.view(-1), bitwidth)
    return output

# Weight-only dequantization: reconstruct the (m, k) fp16 weight matrix from
# the packed codes and the lookup table.
torch.library.define("ours_lib::sq_pack_dequant_simt", "(Tensor q_weight, Tensor lut, int bitwidth, int m, int k) -> Tensor")

@torch.library.register_fake("ours_lib::sq_pack_dequant_simt")
def sq_pack_dequant_simt_abstract(q_weight: torch.Tensor, lut: torch.Tensor, bitwidth: int, m: int, k: int) -> torch.Tensor:
    return torch.zeros(m, k, dtype=torch.float16, device=q_weight.device)

@torch.library.impl("ours_lib::sq_pack_dequant_simt", "cuda")
def sq_pack_dequant_simt_cuda(q_weight: torch.Tensor, lut: torch.Tensor, bitwidth: int, m: int, k: int) -> torch.Tensor:
    output = torch.zeros(m, k, dtype=torch.float16, device=q_weight.device)
    sq_pack_gemm.pack_dequant(output, q_weight, lut.view(-1), bitwidth)
    return output


# In-place GEMM variant: caller supplies (and owns) the output buffer, which
# the op mutates; declared via custom_op so autograd/compile know about the
# mutation through mutates_args.
@torch.library.custom_op("ours_lib::sq_pack_gemm_inplace_simt", mutates_args={"output"})
def sq_pack_gemm_inplace_simt(x: torch.Tensor, q_weight: torch.Tensor, lut: torch.Tensor, output: torch.Tensor, bitwidth: int) -> None:
    sq_pack_gemm.pack_gemm(x, output, q_weight, lut, bitwidth)

@sq_pack_gemm_inplace_simt.register_fake
def _(x, q_weight, lut, output, bitwidth):
    # Pure mutation op: nothing to return for the fake implementation.
    return None
379
+
380
+ """
381
+ VQ Pack SIMT
382
+ """
383
+ codeT_sz = 32
384
+ for vec_sz in [2,4]:
385
+ if vec_sz == 2:
386
+ lut_bits_list = [3,4,5,6,7,8,9,10,11,12]
387
+ elif vec_sz == 4:
388
+ lut_bits_list = [6,7,8,9,10,11,12]
389
+ for lut_bits in lut_bits_list:
390
+ code_n = lut_bits
391
+ recons_n = int(vec_sz * 16)
392
+ for maxm in [1,2,4,8]:
393
+ name = f"vq_pack_gemm_simt_{maxm}_{vec_sz}_{lut_bits}"
394
+ kernel_name = f"vq_pack_gemm.vq_pack_gemm_{maxm}_{lut_bits}_{vec_sz}_{code_n}_{codeT_sz}_{recons_n}"
395
+ torch.library.define(f"ours_lib::{name}", "(Tensor x, Tensor q_weight, Tensor lut) -> Tensor")
396
+ exec(f"""\
397
+ @torch.library.register_fake("ours_lib::{name}")
398
+ def {name}_abstract(x: torch.Tensor, q_weight: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
399
+ return torch.zeros(x.shape[0], 1, q_weight.shape[0], dtype=torch.float16, device=x.device)
400
+
401
+ @torch.library.impl("ours_lib::{name}", "cuda")
402
+ def {name}_cuda(x: torch.Tensor, q_weight: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
403
+ output = torch.zeros(x.shape[0], 1, q_weight.shape[0], dtype=torch.float16, device=x.device)
404
+ {kernel_name}(x, output, q_weight.view(torch.uint32), lut)
405
+ return output
406
+ """)
407
+ name = f"vq_pack_dequant_simt_{vec_sz}_{lut_bits}"
408
+ kernel_name = f"vq_pack_gemm.vq_pack_dequant_{lut_bits}_{vec_sz}_{code_n}_{codeT_sz}_{recons_n}"
409
+ torch.library.define(f"ours_lib::{name}", "(Tensor q_weight, Tensor lut, int m, int k) -> Tensor")
410
+ exec(f"""\
411
+ @torch.library.register_fake("ours_lib::{name}")
412
+ def {name}_abstract(q_weight: torch.Tensor, lut: torch.Tensor, m: int, k: int) -> torch.Tensor:
413
+ return torch.zeros(m, k, dtype=torch.float16, device=q_weight.device)
414
+
415
+ @torch.library.impl("ours_lib::{name}", "cuda")
416
+ def {name}_cuda(q_weight: torch.Tensor, lut: torch.Tensor, m: int, k: int) -> torch.Tensor:
417
+ output = torch.zeros(m, k, dtype=torch.float16, device=q_weight.device)
418
+ {kernel_name}(output, q_weight.view(torch.uint32), lut)
419
+ return output
420
+ """)
421
+
422
+
423
if __name__ == "__main__":
    # Smoke test.  The decompress/GEMM custom ops are CUDA-only kernels, so
    # the layer's buffers (trellis codes, tlut) must be moved to the GPU to
    # match the CUDA input — previously the layer stayed on the CPU and the
    # forward call failed with a device mismatch.
    layer = CombLinearTCQ(4096, 4096, 16, 16, (2048, 2048), 16, (3, 4), 2, 9, False).cuda()
    print(layer._info())
    layer.forward(torch.randn(1, 4096).cuda())
lib/linear/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (17.5 kB). View file
 
lib/linear/__pycache__/comb_linear.cpython-311.pyc ADDED
Binary file (19.5 kB). View file
 
lib/linear/__pycache__/incoherent_linear.cpython-311.pyc ADDED
Binary file (42.7 kB). View file
 
lib/linear/__pycache__/quantized_linear.cpython-311.pyc ADDED
Binary file (6.42 kB). View file
 
lib/linear/__pycache__/tcq_linear.cpython-311.pyc ADDED
Binary file (6.88 kB). View file
 
lib/linear/__pycache__/vq_linear.cpython-311.pyc ADDED
Binary file (12 kB). View file
 
lib/linear/comb_linear.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+
5
class CombLinearTCQ(nn.Module):
    """Linear layer whose weight is split along the output (row) dimension
    into two partitions, each compressed with trellis-coded quantization at
    its own bitrate KV[i].

    When the two partitions have equal size a single fused "comb" kernel
    handles both; otherwise each partition is processed separately and the
    partial outputs are concatenated.
    """

    def __init__(
        self,
        in_features,
        out_features,
        td_x,          # tile height (output dim)
        td_y,          # tile width (input dim)
        out_part,      # 2-tuple of output partition sizes; must sum to out_features
        L,             # trellis window
        KV,            # 2-tuple of bits-per-weight, one per partition
        V,             # vq dim
        tlut_bits,     # log2 of the lookup-table size
        bias=False,
        dtype=torch.float16,
    ):
        super().__init__()
        assert len(out_part) == 2 and len(KV) == 2
        assert out_part[0] + out_part[1] == out_features

        self.in_features = in_features
        self.out_features = out_features
        self.out_part = out_part
        self.td_x = td_x
        self.td_y = td_y
        self.L = L
        self.KV = KV
        self.V = V
        self.tlut_bits = tlut_bits
        self.dtype = dtype

        # One packed int16 code buffer per output partition:
        # rows = number of (td_x x td_y) weight tiles, cols = int16 words/tile.
        for part in (0, 1):
            n_tiles = (out_part[part] // td_x) * (in_features // td_y)
            words_per_tile = math.ceil((td_x * td_y) * KV[part] / 16 / V)
            self.register_buffer(
                f'trellis{part + 1}',
                torch.zeros(n_tiles, words_per_tile, dtype=torch.int16))

        # Shared lookup table of 2**tlut_bits V-dimensional fp16 codewords.
        self.tlut = nn.Parameter(
            torch.zeros(2 ** tlut_bits, V, dtype=torch.float16),
            requires_grad=False)

        if bias:
            self.register_buffer('bias', torch.ones(out_features))
        else:
            self.bias = None

        # The fused kernel only supports equally sized partitions.
        self.use_comb_kernel = out_part[0] == out_part[1]

    def _info(self):
        """Return a serializable description: metadata plus CPU tensor copies."""
        return {
            "in_features": self.in_features,
            "out_features": self.out_features,
            "td_x": self.td_x,
            "td_y": self.td_y,
            "out_part": self.out_part,
            "L": self.L,
            "KV": self.KV,
            "V": self.V,
            'tlut_bits': self.tlut_bits,
            "dtype": self.dtype,
            "trellis1": self.trellis1.detach().cpu(),
            "trellis2": self.trellis2.detach().cpu(),
            "tlut": self.tlut.detach().cpu().half(),
            "bias": self.bias.detach().cpu() if self.bias is not None else None,
        }

    def forward(self, inp, **kwargs):
        """Compute inp @ W.T with the quantized weight.

        Small batches (<= 8 rows) use fused decompress+GEMM kernels; larger
        batches dequantize the weight once and fall back to a dense matmul.
        """
        flat = inp.view(-1, self.in_features)
        bs = flat.shape[0]
        m, k = self.out_features, self.in_features
        if bs <= 8:
            out = self._fused_gemm(flat, bs, k)
        else:
            out = self._dequant_matmul(flat, k)
        return out.view(*inp.shape[:-1], m).to(inp.dtype)

    def _fused_gemm(self, x, bs, k):
        # Fused decompress+GEMM path; kernels are specialized per batch size.
        if self.use_comb_kernel:
            op = getattr(
                torch.ops.ours_lib,
                f"decompress_gemm_tcq_comb_{self.out_features}_{bs}_{k}_{self.tlut_bits}_{self.KV[0]}_{self.KV[1]}"
            )
            return op(self.trellis1, self.trellis2, x, self.tlut)
        op1 = getattr(
            torch.ops.ours_lib,
            f"decompress_gemm_tcq_{self.out_part[0]}_{bs}_{k}_{self.tlut_bits}_{self.KV[0]}"
        )
        op2 = getattr(
            torch.ops.ours_lib,
            f"decompress_gemm_tcq_{self.out_part[1]}_{bs}_{k}_{self.tlut_bits}_{self.KV[1]}"
        )
        # Each partition produces its own slice of the output columns.
        return torch.cat(
            [op1(self.trellis1, x, self.tlut), op2(self.trellis2, x, self.tlut)],
            dim=1)

    def _dequant_matmul(self, x, k):
        # Large-batch path: reconstruct the fp16 weight, then dense matmul.
        if self.use_comb_kernel:
            op = getattr(
                torch.ops.ours_lib,
                f"decompress_tcq_comb_{self.tlut_bits}_{self.KV[0]}_{self.KV[1]}"
            )
            with torch.no_grad():
                dq = op(self.trellis1, self.trellis2, self.tlut, self.out_features, k)
                return x.to(dq.dtype) @ dq.T
        op1 = getattr(
            torch.ops.ours_lib,
            f"decompress_tcq_{self.tlut_bits}_{self.KV[0]}"
        )
        op2 = getattr(
            torch.ops.ours_lib,
            f"decompress_tcq_{self.tlut_bits}_{self.KV[1]}"
        )
        with torch.no_grad():
            dq1 = op1(self.trellis1, self.tlut, self.out_part[0], k)
            dq2 = op2(self.trellis2, self.tlut, self.out_part[1], k)
            return torch.cat(
                [x.to(dq1.dtype) @ dq1.T, x.to(dq2.dtype) @ dq2.T], dim=1)

    @staticmethod
    def gen_layer_from_info(info):
        """Rebuild a CombLinearTCQ from a dict produced by _info()."""
        layer = CombLinearTCQ(
            info["in_features"], info["out_features"], info["td_x"],
            info["td_y"], info["out_part"], info["L"], info["KV"], info["V"],
            info["tlut_bits"], info["bias"] is not None, info["dtype"])
        layer.trellis1.data.copy_(info["trellis1"])
        layer.trellis2.data.copy_(info["trellis2"])
        layer.tlut.data.copy_(info["tlut"])
        if info["bias"] is not None:
            layer.bias.data.copy_(info["bias"])
        return layer

    def get_weight(self):
        """Dequantize and return the full fp16 weight matrix (CUDA only)."""
        op = getattr(
            torch.ops.ours_lib,
            f"decompress_tcq_comb_{self.tlut_bits}_{self.KV[0]}_{self.KV[1]}"
        )
        return op(self.trellis1, self.trellis2, self.tlut,
                  self.out_features, self.in_features)
146
+
147
+
148
class CombtLinearTCQ(nn.Module):
    """Linear layer whose weight is split along the *input* (column)
    dimension into two partitions, each trellis-coded at bitrate KV[i].

    Counterpart of CombLinearTCQ (which splits along the output dimension):
    here the two partial products are summed rather than concatenated.
    """

    def __init__(
        self,
        in_features,
        out_features,
        td_x,          # tile height (output dim)
        td_y,          # tile width (input dim)
        in_part,       # 2-tuple of input partition sizes; must sum to in_features
        L,             # trellis window
        KV,            # 2-tuple of bits-per-weight, one per partition
        V,             # vq dim
        tlut_bits,     # log2 of the lookup-table size
        bias=False,
        dtype=torch.float16,
    ):
        super().__init__()
        assert len(in_part) == 2 and len(KV) == 2
        assert in_part[0] + in_part[1] == in_features

        self.in_features = in_features
        self.out_features = out_features
        self.in_part = in_part
        self.td_x = td_x
        self.td_y = td_y
        self.L = L
        self.KV = KV
        self.V = V
        self.tlut_bits = tlut_bits
        self.dtype = dtype

        # One packed int16 code buffer per input partition.
        for part in (0, 1):
            n_tiles = (out_features // td_x) * (in_part[part] // td_y)
            words_per_tile = math.ceil((td_x * td_y) * KV[part] / 16 / V)
            self.register_buffer(
                f'trellis{part + 1}',
                torch.zeros(n_tiles, words_per_tile, dtype=torch.int16))

        # Shared lookup table of 2**tlut_bits V-dimensional fp16 codewords.
        self.tlut = nn.Parameter(
            torch.zeros(2 ** tlut_bits, V, dtype=torch.float16),
            requires_grad=False)

        if bias:
            self.register_buffer('bias', torch.ones(out_features))
        else:
            self.bias = None

        # The fused "combt" kernel only supports equally sized partitions.
        self.use_comb_kernel = in_part[0] == in_part[1]

    def _info(self):
        """Return a serializable description: metadata plus CPU tensor copies."""
        return {
            "in_features": self.in_features,
            "out_features": self.out_features,
            "td_x": self.td_x,
            "td_y": self.td_y,
            "in_part": self.in_part,
            "L": self.L,
            "KV": self.KV,
            "V": self.V,
            'tlut_bits': self.tlut_bits,
            "dtype": self.dtype,
            "trellis1": self.trellis1.detach().cpu(),
            "trellis2": self.trellis2.detach().cpu(),
            "tlut": self.tlut.detach().cpu().half(),
            "bias": self.bias.detach().cpu() if self.bias is not None else None,
        }

    def forward(self, inp, **kwargs):
        """Compute inp @ W.T with the quantized weight.

        Small batches (<= 8 rows) use fused decompress+GEMM kernels; larger
        batches dequantize each partition once and sum dense matmuls.
        """
        flat = inp.view(-1, self.in_features)
        bs = flat.shape[0]
        m, k = self.out_features, self.in_features
        if bs <= 8:
            out = self._fused_gemm(flat, bs, k)
        else:
            out = self._dequant_matmul(flat, m, k)
        return out.view(*inp.shape[:-1], m).to(inp.dtype)

    def _fused_gemm(self, x, bs, k):
        # Fused decompress+GEMM path; kernels are specialized per batch size.
        if self.use_comb_kernel:
            op = getattr(
                torch.ops.ours_lib,
                f"decompress_gemm_tcq_combt_{self.out_features}_{bs}_{k}_{self.tlut_bits}_{self.KV[0]}_{self.KV[1]}"
            )
            return op(self.trellis1, self.trellis2, x, self.tlut)
        m = self.out_features
        op1 = getattr(
            torch.ops.ours_lib,
            f"decompress_gemm_tcq_{m}_{bs}_{self.in_part[0]}_{self.tlut_bits}_{self.KV[0]}"
        )
        op2 = getattr(
            torch.ops.ours_lib,
            f"decompress_gemm_tcq_{m}_{bs}_{self.in_part[1]}_{self.tlut_bits}_{self.KV[1]}"
        )
        # Each partition consumes its own slice of the input columns; partial
        # products over the two slices are summed.
        return (op1(self.trellis1, x[:, :self.in_part[0]], self.tlut)
                + op2(self.trellis2, x[:, self.in_part[0]:], self.tlut))

    def _dequant_matmul(self, x, m, k):
        # Large-batch path: reconstruct fp16 weights, then dense matmuls.
        if self.use_comb_kernel:
            op = getattr(
                torch.ops.ours_lib,
                f"decompress_tcq_combt_{self.tlut_bits}_{self.KV[0]}_{self.KV[1]}"
            )
            with torch.no_grad():
                dq = op(self.trellis1, self.trellis2, self.tlut, m, k)
                return x.to(dq.dtype) @ dq.T
        op1 = getattr(
            torch.ops.ours_lib,
            f"decompress_tcq_{self.tlut_bits}_{self.KV[0]}"
        )
        op2 = getattr(
            torch.ops.ours_lib,
            f"decompress_tcq_{self.tlut_bits}_{self.KV[1]}"
        )
        with torch.no_grad():
            dq1 = op1(self.trellis1, self.tlut, m, self.in_part[0])
            dq2 = op2(self.trellis2, self.tlut, m, self.in_part[1])
            return (x[:, :self.in_part[0]].to(dq1.dtype) @ dq1.T
                    + x[:, self.in_part[0]:].to(dq2.dtype) @ dq2.T)

    @staticmethod
    def gen_layer_from_info(info):
        """Rebuild a CombtLinearTCQ from a dict produced by _info()."""
        layer = CombtLinearTCQ(
            info["in_features"], info["out_features"], info["td_x"],
            info["td_y"], info["in_part"], info["L"], info["KV"], info["V"],
            info["tlut_bits"], info["bias"] is not None, info["dtype"])
        layer.trellis1.data.copy_(info["trellis1"])
        layer.trellis2.data.copy_(info["trellis2"])
        layer.tlut.data.copy_(info["tlut"])
        if info["bias"] is not None:
            layer.bias.data.copy_(info["bias"])
        return layer

    def get_weight(self):
        """Dequantize and return the full fp16 weight matrix (CUDA only)."""
        op = getattr(
            torch.ops.ours_lib,
            f"decompress_tcq_combt_{self.tlut_bits}_{self.KV[0]}_{self.KV[1]}"
        )
        return op(self.trellis1, self.trellis2, self.tlut,
                  self.out_features, self.in_features)

    @staticmethod
    def merge_infos(info1, info2):
        """Stack two _info() dicts along the output dimension into one.

        Both layers must share every quantization hyperparameter and use the
        same lookup table; biases are not supported for merging.
        """
        assert info1["in_features"] == info2["in_features"]
        assert info1["td_x"] == info2["td_x"]
        assert info1["td_y"] == info2["td_y"]
        assert info1["L"] == info2["L"]
        assert info1["KV"] == info2["KV"]
        assert info1["V"] == info2["V"]
        assert info1["tlut_bits"] == info2["tlut_bits"]
        if not torch.allclose(info1["tlut"], info2["tlut"], atol=1e-4):
            print("warning: tlut is not close. it is unexpected behavior if you do not use dummy quantizers.")
        assert info1["bias"] is None and info2["bias"] is None
        assert info1["dtype"] == info2["dtype"]
        return {
            "in_features": info1["in_features"],
            "out_features": info1["out_features"] + info2["out_features"],
            "td_x": info1["td_x"],
            "td_y": info1["td_y"],
            "L": info1["L"],
            "KV": info1["KV"],
            "V": info1["V"],
            "tlut_bits": info1["tlut_bits"],
            "bias": None,
            "dtype": info1["dtype"],
            # Tile rows are ordered output-major, so stacking the code
            # buffers matches stacking the weights along the output dim.
            "trellis1": torch.cat([info1["trellis1"], info2["trellis1"]], dim=0),
            "trellis2": torch.cat([info1["trellis2"], info2["trellis2"]], dim=0),
            "tlut": info1["tlut"],
            "in_part": info1["in_part"],
        }
321
+
322
if __name__ == "__main__":
    # Smoke test.  The decompress/GEMM custom ops are CUDA-only kernels, so
    # the layer's buffers (trellis codes, tlut) must be moved to the GPU to
    # match the CUDA input — previously the layer stayed on the CPU and the
    # forward call failed with a device mismatch.
    layer = CombLinearTCQ(4096, 4096, 16, 16, (2048, 2048), 16, (3, 4), 2, 9, False).cuda()
    print(layer._info())
    layer.forward(torch.randn(1, 4096).cuda())
lib/linear/incoherent_linear.py ADDED
@@ -0,0 +1,639 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from lib.utils import (get_hadK, matmul_hadU_cuda, matmul_hadUt_cuda, matmul_hadUt, matmul_hadU, matmul_hadUt_head, matmul_hadU_head, matmul_hadU_head_cuda, matmul_hadUt_head_cuda)
5
+ from lib.linear.tcq_linear import QTIPLinearTCQ
6
+ from lib.linear.vq_linear import VQLinearPackTensorCore, VQLinearPackSIMT
7
+ from lib.linear.comb_linear import CombLinearTCQ, CombtLinearTCQ
8
+ from transformers.activations import ACT2FN
9
+ from transformers.models.llama.configuration_llama import LlamaConfig
10
+ from typing import Optional, Tuple
11
+ from model.llama import LlamaRotaryEmbedding, repeat_kv, apply_rotary_pos_emb, Cache
12
+
13
def make_linear(info, use_simt=False):
    """Instantiate the linear implementation matching a quantization result.

    The quantizer string in info["quant_info"] selects the layer class; each
    quantized class is rebuilt from info["linear_info"].  Unrecognized
    quantizers fall back to an unquantized nn.Linear.
    """
    qstr = info["quant_info"]["quantizer_str"]
    if "tcq" in qstr:
        return QTIPLinearTCQ.gen_layer_from_info(info["linear_info"])
    if "sq" in qstr or "vq" in qstr or "ldlq" in qstr:
        # SIMT vs tensor-core kernels share the same packed format.
        vq_cls = VQLinearPackSIMT if use_simt else VQLinearPackTensorCore
        return vq_cls.gen_layer_from_info(info["linear_info"])
    if "tcomb" in qstr:
        return CombtLinearTCQ.gen_layer_from_info(info["linear_info"])
    if "comb" in qstr:
        return CombLinearTCQ.gen_layer_from_info(info["linear_info"])
    return nn.Linear(info["in_features"], info["out_features"], bias=False)
27
+
28
class IncoherentSdpaAttention(nn.Module):
    """Llama-style SDPA attention over incoherence-processed activations.

    Inputs are sign-flipped (SU_*) and Hadamard-rotated before each quantized
    projection; outputs are rescaled by per-channel Wscale_* factors.  The
    q/k/v projections can optionally be fused into one merged quantized
    linear layer (exactly one of merge_qk / merge_kv / merge_qv / merge_qkv).
    The projection modules themselves are populated by gen_layer_from_info.
    """

    def __init__(self, config, merge_qk=False, merge_kv=False, merge_qv=False, merge_qkv=False, layer_idx=None, dtype=torch.float16):
        super().__init__()
        self.config = config
        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = getattr(config, "head_dim",
                                self.hidden_size // self.num_heads)
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        # Total output width of the k/v projections (GQA-aware).
        self.kv_out = self.hidden_size * self.num_key_value_heads // self.num_heads
        self.rope_theta = config.rope_theta
        self.is_causal = True

        # Projection modules are filled in later by gen_layer_from_info;
        # which ones are used depends on the merge_* flags.
        # (NOTE(review): a merged qv_proj is assigned via setattr only —
        # there is no qv_proj placeholder here.)
        self.q_proj = None
        self.k_proj = None
        self.v_proj = None
        self.o_proj = None
        self.qk_proj = None
        self.qkv_proj = None
        self.kv_proj = None
        self.dtype=dtype
        self.layer_idx = layer_idx
        # Per-input-channel sign vectors applied before the Hadamard rotation.
        self.register_buffer("SU_qkv", torch.ones(config.hidden_size, dtype=self.dtype))
        self.register_buffer("SU_o", torch.ones(config.hidden_size, dtype=self.dtype))

        hidden_had, hidden_K = get_hadK(config.hidden_size)

        hidden_had_T = hidden_had.T.contiguous().cuda() if hidden_had is not None else None

        # Per-output-channel weight scales for the concatenated [q | k | v]
        # projection outputs (order becomes [q | v | k] under merge_qv), and
        # for the output projection.  Non-persistent: restored at load time.
        self.register_buffer('Wscale_qkv', torch.ones(config.hidden_size + 2 * self.kv_out, dtype=self.dtype), persistent=False)
        self.register_buffer('Wscale_o', torch.ones(config.hidden_size, dtype=self.dtype), persistent=False)
        self.register_buffer('had_left_qkv_T', hidden_had_T, persistent=False)
        self.register_buffer('had_left_o_T', hidden_had_T, persistent=False)

        self.hidden_K = hidden_K
        # Fixed pre/post scaling factor applied around each quantized matmul
        # (divided out before the projection, multiplied back after).
        self.scale = 64.0
        self.rotary_emb = LlamaRotaryEmbedding(config=self.config)

        self.merge_qk = merge_qk
        self.merge_kv = merge_kv
        self.merge_qv = merge_qv
        self.merge_qkv = merge_qkv

        assert sum([self.merge_qk, self.merge_kv, self.merge_qv, self.merge_qkv]) <= 1, "Only one of merge_qk, merge_kv, merge_qv, merge_qkv can be True"

    def compute_qkv(self, input):
        """Apply the incoherence transform once, then the q/k/v projections.

        Returns (q, k, v) with q of width hidden_size and k, v of width
        kv_out, reshaped back to the input's leading dimensions.
        """
        n = len(self.SU_qkv)
        x = input.view(-1, n).half()

        # Sign-flip + Hadamard rotation, shared by all three projections.
        x = matmul_hadU_cuda(x * self.SU_qkv, self.had_left_qkv_T, self.hidden_K) / self.scale
        if self.merge_qkv:
            qkv = self.qkv_proj(x.half()) * self.Wscale_qkv * self.scale
            q, k, v = qkv.split([self.hidden_size, self.kv_out, self.kv_out], dim=-1)
        elif self.merge_qk:
            qk = self.qk_proj(x.half()) * self.Wscale_qkv[:self.hidden_size + self.kv_out] * self.scale
            q, k = qk.split([self.hidden_size, self.kv_out], dim=-1)
            v = self.v_proj(x.half()) * self.Wscale_qkv[self.hidden_size + self.kv_out:] * self.scale
        elif self.merge_kv:
            kv = self.kv_proj(x.half()) * self.Wscale_qkv[self.hidden_size:] * self.scale
            k, v = kv.split([self.kv_out, self.kv_out], dim=-1)
            q = self.q_proj(x.half()) * self.Wscale_qkv[:self.hidden_size] * self.scale
        elif self.merge_qv:
            # Wscale_qkv was packed [q | v | k] for this mode
            # (see gen_layer_from_info), so the slices line up.
            qv = self.qv_proj(x.half()) * self.Wscale_qkv[:self.hidden_size + self.kv_out] * self.scale
            q, v = qv.split([self.hidden_size, self.kv_out], dim=-1)
            k = self.k_proj(x.half()) * self.Wscale_qkv[self.hidden_size + self.kv_out:] * self.scale
        else:
            q = self.q_proj(x.half()) * self.Wscale_qkv[:self.hidden_size] * self.scale
            k = self.k_proj(x.half()) * self.Wscale_qkv[self.hidden_size:self.hidden_size + self.kv_out] * self.scale
            v = self.v_proj(x.half()) * self.Wscale_qkv[self.hidden_size + self.kv_out:] * self.scale
        return q.view(*input.shape[:-1], n), k.view(*input.shape[:-1], self.kv_out), v.view(*input.shape[:-1], self.kv_out)

    def compute_o(self, input):
        """Incoherence transform + quantized output projection."""
        n = len(self.SU_o)
        x = input.view(-1, n).half()
        x = matmul_hadU_cuda(x * self.SU_o, self.had_left_o_T, self.hidden_K) / self.scale
        x = self.o_proj(x.half()) * self.Wscale_o * self.scale
        return x.view(*input.shape[:-1], n)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            Tuple[torch.Tensor,
                  torch.Tensor]] = None,  # will become mandatory in v4.46
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
               Optional[Tuple[torch.Tensor]]]:
        if output_attentions:
            # NOTE(review): nn.Module defines no forward with this signature,
            # so this fallback raises unless a subclass/mixin supplies one —
            # confirm before relying on output_attentions=True.
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        bsz, q_len, _ = hidden_states.size()

        # query_states = self.q_proj(hidden_states)
        # key_states = self.k_proj(hidden_states)
        # value_states = self.v_proj(hidden_states)
        query_states, key_states, value_states = self.compute_qkv(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads,
                                         self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads,
                                     self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads,
                                         self.head_dim).transpose(1, 2)

        if position_embeddings is None:
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings

        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin)


        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "cache_position": cache_position
            }
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs)

        # Expand grouped KV heads to match the number of query heads.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = causal_mask[:, :, :, :key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and causal_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states.to(query_states.device),
            value_states.to(query_states.device),
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, -1)

        # attn_output = self.o_proj(attn_output)
        attn_output = self.compute_o(attn_output)
        return attn_output, None, past_key_value

    @staticmethod
    def gen_layer_from_info(config, layer_idx, info_q, info_k, info_v, info_o, merge_qk=False, merge_qv=False, merge_kv=False, merge_qkv=False, dummy=False, use_simt=False, use_simt_q=None, use_simt_k=None, use_simt_v=None, use_simt_o=None):
        """Build an attention layer from per-projection quantization results.

        Each info_* dict carries the sign vector ("SU"), per-channel weight
        scales ("Wscale") and the packed layer data ("linear_info").  When a
        merge flag is set, the corresponding linear_infos are merged via the
        layer class's merge_infos before instantiation.
        """
        attn = IncoherentSdpaAttention(config, merge_qk=merge_qk, merge_qv=merge_qv, merge_kv=merge_kv, merge_qkv=merge_qkv, layer_idx=layer_idx)
        if not dummy:
            attn.SU_qkv.data.copy_(info_q["SU"])
            attn.SU_o.data.copy_(info_o["SU"])
            if not merge_qv:
                attn.Wscale_qkv.data.copy_(torch.cat([info_q["Wscale"], info_k["Wscale"], info_v["Wscale"]], dim=-1))
            else:
                # merge_qv packs the scales in [q | v | k] order to match the
                # qv_proj output split in compute_qkv.
                attn.Wscale_qkv.data.copy_(torch.cat([info_q["Wscale"], info_v["Wscale"], info_k["Wscale"]], dim=-1))
            attn.Wscale_o.data.copy_(info_o["Wscale"])

        # Per-projection SIMT overrides fall back to the global use_simt flag.
        use_simt_q = use_simt if use_simt_q is None else use_simt_q
        use_simt_k = use_simt if use_simt_k is None else use_simt_k
        use_simt_v = use_simt if use_simt_v is None else use_simt_v
        use_simt_o = use_simt if use_simt_o is None else use_simt_o

        # Select which infos get merged into a single projection and which
        # stay as separate per-projection layers.
        if merge_qk:
            to_merged, rest, target_proj, rest_proj = [info_q, info_k], [info_v], "qk_proj", ["v_proj"]
        elif merge_kv:
            to_merged, rest, target_proj, rest_proj = [info_k, info_v], [info_q], "kv_proj", ["q_proj"]
        elif merge_qv:
            to_merged, rest, target_proj, rest_proj = [info_q, info_v], [info_k], "qv_proj", ["k_proj"]
        elif merge_qkv:
            to_merged, rest, target_proj, rest_proj = [info_q, info_k, info_v], [], "qkv_proj", []
        else:
            to_merged, rest, target_proj, rest_proj = [], [info_q, info_k, info_v], "", ["q_proj", "k_proj", "v_proj"]

        if merge_qk or merge_kv or merge_qv or merge_qkv:
            # The merged layer's SIMT choice follows its first constituent.
            if merge_kv: use_simt_merge = use_simt_k
            elif merge_qk or merge_qv or merge_qkv: use_simt_merge = use_simt_q
            else: raise ValueError

            # NOTE(review): if the quantizer string matches none of these
            # branches, merged_linear stays unbound and the call below raises
            # NameError — confirm all expected quantizer strings are covered.
            if "tcq" in to_merged[0]["quant_info"]["quantizer_str"]:
                merged_linear = QTIPLinearTCQ
            elif "sq" in to_merged[0]["quant_info"]["quantizer_str"] or "vq" in to_merged[0]["quant_info"]["quantizer_str"] or "ldlq" in to_merged[0]["quant_info"]["quantizer_str"]:
                merged_linear = VQLinearPackTensorCore if not use_simt_merge else VQLinearPackSIMT
            elif "tcomb" in to_merged[0]["quant_info"]["quantizer_str"]:
                merged_linear = CombtLinearTCQ
            elif "comb" in to_merged[0]["quant_info"]["quantizer_str"]:
                merged_linear = CombLinearTCQ
            merged = to_merged[0]['linear_info']
            for info in to_merged[1:]:
                merged = merged_linear.merge_infos(merged, info['linear_info'])
            setattr(attn, target_proj, merged_linear.gen_layer_from_info(merged))
        for info, proj in zip(rest, rest_proj):
            if proj == "q_proj": cur_use_simt = use_simt_q
            elif proj == "k_proj": cur_use_simt = use_simt_k
            elif proj == "v_proj": cur_use_simt = use_simt_v
            else: raise ValueError
            setattr(attn, proj, make_linear(info, use_simt=cur_use_simt))
        attn.o_proj = make_linear(info_o, use_simt=use_simt_o)
        return attn

    @staticmethod
    def gen_layer_from_quantizer_str_and_key(config, layer_idx, quant_dir, quantizer_str_q, quantizer_str_k, quantizer_str_v, quantizer_str_o, key_q, key_k, key_v, key_o, merge_qk=False, merge_qv=False, merge_kv=False, merge_qkv=False, dummy=False, use_simt=False, use_simt_q=None, use_simt_k=None, use_simt_v=None, use_simt_o=None):
        """Load per-projection quant results from disk (or synthesize dummy
        ones) and delegate to gen_layer_from_info."""
        if not dummy:
            info_q = torch.load(f"{quant_dir}/{quantizer_str_q}/{key_q}.pt")
            info_k = torch.load(f"{quant_dir}/{quantizer_str_k}/{key_k}.pt")
            info_v = torch.load(f"{quant_dir}/{quantizer_str_v}/{key_v}.pt")
            info_o = torch.load(f"{quant_dir}/{quantizer_str_o}/{key_o}.pt")
        else:
            # Dummy path: fabricate quant results (e.g. for memory profiling)
            # instead of reading checkpoints.
            from lib.utils.mem_op import get_dummy_quant_results
            from lib.config import MODEL_KEYS
            model_key = MODEL_KEYS[config._name_or_path]
            info_q = get_dummy_quant_results(model_key, f"self_attn.q_proj", quantizer_str_q)
            info_k = get_dummy_quant_results(model_key, f"self_attn.k_proj", quantizer_str_k)
            info_v = get_dummy_quant_results(model_key, f"self_attn.v_proj", quantizer_str_v)
            info_o = get_dummy_quant_results(model_key, f"self_attn.o_proj", quantizer_str_o)

        return IncoherentSdpaAttention.gen_layer_from_info(config, layer_idx, info_q, info_k, info_v, info_o, merge_qk=merge_qk, merge_qv=merge_qv, merge_kv=merge_kv, merge_qkv=merge_qkv, dummy=dummy, use_simt=use_simt, use_simt_q=use_simt_q, use_simt_k=use_simt_k, use_simt_v=use_simt_v, use_simt_o=use_simt_o)
275
+
276
+
277
+
278
+
279
class IncoherentMLP(nn.Module):
    """
    Gated MLP (up/gate/down) whose projections are quantized linear layers
    evaluated in an incoherent basis: activations are multiplied by a random
    sign vector (SU) and passed through a Hadamard transform before each
    quantized matmul, then rescaled by per-output-channel weight scales.

    only support left only and unified SU for upgates.
    """
    def __init__(self, hidden_size, intermediate_size, hidden_act, merge_ug=False, bias=False, dtype=torch.float16):
        super().__init__()
        assert bias is False, "bias is not supported"
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.dtype = dtype

        # Exactly one of {up_proj+gate_proj, ug_proj} is populated later by
        # gen_layer_from_info, depending on merge_ug.
        self.up_proj = None
        self.gate_proj = None
        self.ug_proj = None
        self.down_proj = None

        # Sign vectors applied before the Hadamard transform:
        # SU_ug acts on the hidden-dim input of up/gate, SU_dp on the
        # intermediate-dim input of down_proj.
        self.register_buffer("SU_ug", torch.ones(hidden_size, dtype=self.dtype))
        self.register_buffer("SU_dp", torch.ones(intermediate_size, dtype=self.dtype))

        hidden_had, hidden_K = get_hadK(hidden_size)
        inter_had, inter_K = get_hadK(intermediate_size)

        # get_hadK may return None when the dimension is a power of two and no
        # auxiliary Hadamard factor is needed — TODO confirm against get_hadK.
        inter_had_T = inter_had.T.contiguous().cuda() if inter_had is not None else None
        hidden_had_T = hidden_had.T.contiguous().cuda() if hidden_had is not None else None

        # Per-output-channel weight scales; Wscale_ug holds up (first half)
        # then gate (second half), hence intermediate_size * 2.
        # persistent=False: these are restored from the saved quant info, not
        # from the state_dict.
        self.register_buffer('Wscale_ug', torch.ones(intermediate_size * 2, dtype=self.dtype), persistent=False)
        self.register_buffer('Wscale_dp', torch.ones(hidden_size, dtype=self.dtype), persistent=False)
        self.register_buffer('had_left_ug_T', hidden_had_T, persistent=False)
        self.register_buffer('had_left_dp_T', inter_had_T, persistent=False)

        self.hidden_K = hidden_K
        self.inter_K = inter_K

        # Activations are divided by this before the quantized matmul and
        # multiplied back afterwards, keeping half-precision values in range.
        self.scale = 64.0

        self.act_fn = ACT2FN[hidden_act]
        self.merge_ug = merge_ug

    def forward(self, input):
        # Flatten to (tokens, hidden) and run in fp16; down_proj maps back to
        # hidden_size, so the same n is valid for the output reshape.
        n = len(self.SU_ug)
        x = input.view(-1, n).half()
        x = self.compute_ug(x)
        x = self.compute_dp(x)
        return x.view(*input.shape[:-1], n).to(input.dtype)

    def compute_ug(self, x):
        """Up/gate half of the MLP: rotate input, apply quantized up & gate
        projections, then the gated activation act_fn(gate) * up."""
        x = matmul_hadU_cuda(x * self.SU_ug, self.had_left_ug_T, self.hidden_K) / self.scale
        if self.merge_ug:
            # Single fused projection producing [up | gate] concatenated on
            # the last dim.
            x = self.ug_proj(x.half()) * self.Wscale_ug * self.scale
            x_up, x_gate = x.split(self.intermediate_size, dim=-1)
        else:
            # Separate projections; Wscale_ug stores up scales first, then
            # gate scales.
            x_up = self.up_proj(x.half()) * self.Wscale_ug[:self.intermediate_size] * self.scale
            x_gate = self.gate_proj(x.half()) * self.Wscale_ug[self.intermediate_size:] * self.scale
        x = self.act_fn(x_gate) * x_up
        return x

    def compute_dp(self, x):
        """Down-projection half: rotate the intermediate activations and apply
        the quantized down_proj."""
        x = matmul_hadU_cuda(x * self.SU_dp, self.had_left_dp_T, self.inter_K) / self.scale
        x = self.down_proj(x.half()) * self.Wscale_dp * self.scale
        return x

    @staticmethod
    def gen_layer_from_info(config, info_up, info_gate, info_down, merge_ug=False, dummy=False, use_simt=False, use_simt_u=None, use_simt_g=None, use_simt_d=None):
        """Build an IncoherentMLP from saved quantization info dicts.

        The quantizer backend is chosen by substring match on
        info_up["quant_info"]["quantizer_str"]; branch order matters because
        e.g. "tcomb" contains "comb". use_simt_u/g/d override the global
        use_simt per projection when not None. With dummy=True the scale/sign
        buffers keep their default values.
        """
        mlp = IncoherentMLP(
            hidden_size=config.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
            merge_ug=merge_ug
        )
        if not dummy:
            # up and gate share a single SU (see class docstring), so only
            # info_up's SU is used.
            mlp.SU_ug.data.copy_(info_up["SU"])
            mlp.SU_dp.data.copy_(info_down["SU"])
            mlp.Wscale_ug.data.copy_(torch.cat([info_up["Wscale"], info_gate["Wscale"]], dim=-1))
            mlp.Wscale_dp.data.copy_(info_down["Wscale"])

        use_simt_u = use_simt if use_simt_u is None else use_simt_u
        use_simt_g = use_simt if use_simt_g is None else use_simt_g
        use_simt_d = use_simt if use_simt_d is None else use_simt_d

        if merge_ug:
            if "tcq" in info_up["quant_info"]["quantizer_str"]:
                linear_info_ug = QTIPLinearTCQ.merge_infos(info_up['linear_info'], info_gate['linear_info'])
                mlp.ug_proj = QTIPLinearTCQ.gen_layer_from_info(linear_info_ug)
            elif "vq" in info_up["quant_info"]["quantizer_str"] or "sq" in info_up["quant_info"]["quantizer_str"] or "ldlq" in info_up["quant_info"]["quantizer_str"]:
                if use_simt_u:
                    linear_info_ug = VQLinearPackSIMT.merge_infos(info_up['linear_info'], info_gate['linear_info'])
                    mlp.ug_proj = VQLinearPackSIMT.gen_layer_from_info(linear_info_ug)
                else:
                    linear_info_ug = VQLinearPackTensorCore.merge_infos(info_up['linear_info'], info_gate['linear_info'])
                    mlp.ug_proj = VQLinearPackTensorCore.gen_layer_from_info(linear_info_ug)
            elif "tcomb" in info_up["quant_info"]["quantizer_str"]:
                linear_info_ug = CombtLinearTCQ.merge_infos(info_up['linear_info'], info_gate['linear_info'])
                mlp.ug_proj = CombtLinearTCQ.gen_layer_from_info(linear_info_ug)
            elif "comb" in info_up["quant_info"]["quantizer_str"]:
                linear_info_ug = CombLinearTCQ.merge_infos(info_up['linear_info'], info_gate['linear_info'])
                mlp.ug_proj = CombLinearTCQ.gen_layer_from_info(linear_info_ug)
        else:
            mlp.up_proj = make_linear(info_up, use_simt=use_simt_u)
            mlp.gate_proj = make_linear(info_gate, use_simt=use_simt_g)
        mlp.down_proj = make_linear(info_down, use_simt=use_simt_d)
        return mlp

    @staticmethod
    def gen_layer_from_quantizer_str_and_key(config, quant_dir, quantizer_str_up, quantizer_str_gate, quantizer_str_down, key_up, key_gate, key_down, merge_ug=False, dummy=False, use_simt=False, use_simt_u=None, use_simt_g=None, use_simt_d=None):
        """Load (or fabricate, when dummy=True) the three per-projection quant
        info dicts from disk and delegate to gen_layer_from_info."""
        if not dummy:
            info_up = torch.load(f"{quant_dir}/{quantizer_str_up}/{key_up}.pt")
            info_gate = torch.load(f"{quant_dir}/{quantizer_str_gate}/{key_gate}.pt")
            info_down = torch.load(f"{quant_dir}/{quantizer_str_down}/{key_down}.pt")
        else:
            # Imported lazily so the dummy path does not pull these modules in
            # during normal inference.
            from lib.utils.mem_op import get_dummy_quant_results
            from lib.config import MODEL_KEYS
            model_key = MODEL_KEYS[config._name_or_path]
            info_up = get_dummy_quant_results(model_key, f"mlp.up_proj", quantizer_str_up)
            info_gate = get_dummy_quant_results(model_key, f"mlp.gate_proj", quantizer_str_gate)
            info_down = get_dummy_quant_results(model_key, f"mlp.down_proj", quantizer_str_down)
        return IncoherentMLP.gen_layer_from_info(config, info_up, info_gate, info_down, merge_ug, dummy=dummy, use_simt=use_simt, use_simt_u=use_simt_u, use_simt_g=use_simt_g, use_simt_d=use_simt_d)
395
+
396
+
397
+
398
class IncoherentLinear(nn.Module):
    """
    A linear layer evaluated in an incoherent basis: the input is multiplied
    by a sign vector SU and Hadamard-rotated, the (typically quantized) inner
    linear is applied, and the output is Hadamard-rotated back and multiplied
    by SV. rot_info can disable the left and/or right rotation (used when
    adjacent layers' rotations have been merged away).
    """
    def __init__(
        self,
        in_features,
        out_features,
        hadU,
        hadV,
        bias=False,
        dtype=torch.float16,
        use_linear=True,
    ):
        super().__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.dtype = dtype

        # use_linear=False leaves a placeholder; gen_layer_from_info installs
        # the quantized linear afterwards.
        if use_linear:
            self.linear = nn.Linear(in_features, out_features, bias=False, dtype=dtype)
        else:
            self.linear = None

        if bias:
            self.register_buffer('bias', torch.ones(out_features))
        else:
            self.bias = None

        # Sign vectors applied on the input (SU) and output (SV) sides.
        self.register_buffer("SU", torch.ones(in_features, dtype=self.dtype))
        self.register_buffer("SV", torch.ones(out_features, dtype=self.dtype))

        # hadU / hadV are the Hadamard transform sizes; they may differ from
        # in/out_features (head-wise transforms).
        self.hadU = hadU
        self.hadV = hadV
        had_left, K_left = get_hadK(hadU)
        had_right, K_right = get_hadK(hadV)
        if had_left is not None:
            had_left_T = had_left.T.contiguous().cuda()
        else:
            had_left_T = None
        if had_right is not None:
            had_right = had_right.cuda()
        # Wscale is persistent (part of the checkpoint); the Hadamard factors
        # are deterministic from hadU/hadV and are rebuilt on load.
        self.register_buffer('Wscale', torch.ones(out_features, dtype=self.dtype), persistent=True)
        self.register_buffer('had_right', had_right, persistent=False)
        self.register_buffer('had_left_T', had_left_T, persistent=False)
        self.K_left = K_left
        self.K_right = K_right
        # Activation pre-scale to keep fp16 values in range around the
        # quantized matmul.
        self.scale = 32.0

        self.rot_info = "all"

        self.skip_l = False
        self.skip_r = False

    def apply_rot_info(self):
        """Translate the rot_info string into the skip_l/skip_r flags used by
        forward(). Must be called after rot_info is changed."""
        if self.rot_info == "all":
            self.skip_l = False
            self.skip_r = False
        elif self.rot_info == "skip_l":
            self.skip_l = True
            self.skip_r = False
        elif self.rot_info == "skip_r":
            self.skip_l = False
            self.skip_r = True
        elif self.rot_info == "skip_lr":
            self.skip_l = True
            self.skip_r = True
        else:
            raise ValueError(f"Invalid rot_info: {self.rot_info}")


    def save_info(self, path, quant_info=None):
        """Serialize everything needed to rebuild this layer (see
        gen_layer_from_info) to `path` via torch.save. Requires the inner
        linear to implement _info()."""
        linear_info = self.linear._info()
        info = {
            "in_features": self.in_features,
            "out_features": self.out_features,
            "hadU": self.hadU,
            "hadV": self.hadV,
            "dtype": self.dtype,
            "scale": self.scale,
            "Wscale": self.Wscale.detach().cpu(),
            "rot_info": self.rot_info,
            "linear_info": linear_info,
            "bias": self.bias.detach().cpu() if self.bias is not None else None,
            "SU": self.SU.detach().cpu(),
            "SV": self.SV.detach().cpu(),
            "quant_info": quant_info,
        }
        torch.save(info, path)

    def forward(self, input):
        n, m = len(self.SU), len(self.SV)
        x = input.view(-1, n).half()#.to(torch.float32)
        if not self.skip_l:
            # Sign flip + left Hadamard rotation, pre-scaled down.
            x = x * self.SU
            x = matmul_hadU_head_cuda(x, self.had_left_T, self.K_left, self.hadU) / self.scale
        else:
            # Left rotation merged into a neighboring op; only the scale
            # remains.
            # x = x * self.SU
            x = x / self.scale
        x = self.linear(x.half()) * self.Wscale#.float()
        if not self.skip_r:
            # Right Hadamard rotation, then sign flip + undo the pre-scale.
            x = matmul_hadU_head_cuda(x, self.had_right, self.K_right, self.hadV)
            x = x.to(self.SV.device) * (self.SV * self.scale)
        else:
            # x = x.to(self.SV.device) * (self.SV * self.scale)
            x = x * self.scale

        x = x.view(*input.shape[:-1], m).to(input.dtype)
        if self.bias is not None:
            x = x + self.bias
        return x

    @staticmethod
    def gen_layer_from_info(info, merge_layers=False, dummy=False, use_simt=False):
        """Rebuild an IncoherentLinear from a dict produced by save_info.

        The inner quantized linear class is selected by substring match on
        info["quant_info"]["quantizer_str"] (branch order matters: "tcomb"
        contains "comb"). merge_layers=True applies rot_info immediately so
        skipped rotations take effect. dummy=True skips copying the saved
        SU/SV/Wscale values.
        """
        layer = IncoherentLinear(
            in_features=info["in_features"],
            out_features=info["out_features"],
            hadU=info["hadU"] if "hadU" in info else info["in_features"],
            hadV=info["hadV"] if "hadV" in info else info["out_features"],
            bias=info["bias"] is not None,
            dtype=info["dtype"],
            use_linear=False,
        )
        if not dummy:
            if info["bias"] is not None:
                layer.bias.data.copy_(info["bias"])
            layer.SU.data.copy_(info["SU"])
            layer.SV.data.copy_(info["SV"])
            layer.Wscale.data.copy_(info["Wscale"])
        if info["quant_info"] is not None:
            if "tcq" in info["quant_info"]["quantizer_str"]:
                layer.linear = QTIPLinearTCQ.gen_layer_from_info(info["linear_info"])
            elif "sq" in info["quant_info"]["quantizer_str"] or "vq" in info["quant_info"]["quantizer_str"] or "ldlq" in info["quant_info"]["quantizer_str"]:
                if use_simt:
                    layer.linear = VQLinearPackSIMT.gen_layer_from_info(info["linear_info"])
                else:
                    layer.linear = VQLinearPackTensorCore.gen_layer_from_info(info["linear_info"])
            elif "tcomb" in info["quant_info"]["quantizer_str"]:
                layer.linear = CombtLinearTCQ.gen_layer_from_info(info["linear_info"])
            elif "comb" in info["quant_info"]["quantizer_str"]:
                layer.linear = CombLinearTCQ.gen_layer_from_info(info["linear_info"])
        # rot_info may live in quant_info (newer saves) or at top level
        # (older saves); default to "all".
        if "rot_info" in info["quant_info"]:
            layer.rot_info = info["quant_info"]["rot_info"]
        elif "rot_info" in info:
            layer.rot_info = info["rot_info"]
        else:
            layer.rot_info = "all"
        if merge_layers:
            layer.apply_rot_info()
        return layer

    @staticmethod
    def gen_layer_from_quantizer_str_and_key(config, quant_dir, quantizer_str, key, merge_layers=False, dummy=False, use_simt=False):
        """Load the saved info dict for `key` (or fabricate one with dummy=True)
        and delegate to gen_layer_from_info."""
        if not dummy:
            info = torch.load(f"{quant_dir}/{quantizer_str}/{key}.pt")
        else:
            from lib.utils.mem_op import get_dummy_quant_results
            from lib.config import MODEL_KEYS
            model_key = MODEL_KEYS[config._name_or_path]
            # key is "<layer_id>_<layer_key>"; strip the layer id prefix.
            layer_id = key.split("_")[0]
            layer_key = key.replace(f"{layer_id}_", "")
            info = get_dummy_quant_results(model_key, f"{layer_key}", quantizer_str)
        return IncoherentLinear.gen_layer_from_info(info, merge_layers=merge_layers, dummy=dummy, use_simt=use_simt)
559
+
560
+
561
def calc_kurtosis(W):
    """Per-row excess-kurtosis statistic of W over its last dimension.

    Returns E[w^4] - 3 computed in float64. The caller is expected to pass
    rows normalized to unit norm (||W[i]|| = 1) — TODO confirm at call sites.
    """
    # Accumulate moments in double precision to avoid fp16/fp32 rounding.
    w64 = W.double()
    fourth_moment = (w64 ** 4).mean(dim=-1)
    return fourth_moment - 3.0
565
+
566
def calc_skewness(W):
    """Per-row skewness statistic of W over its last dimension.

    Returns E[w^3] computed in float64. The caller is expected to pass rows
    normalized to unit norm (||W[i]|| = 1) — TODO confirm at call sites.
    """
    # Accumulate the third moment in double precision for stability.
    w64 = W.double()
    return (w64 ** 3).mean(dim=-1)
570
+
571
def linear_to_incoherent(linear, hadU, hadV, SU=None, SV=None, lnorm=None, rot_info="all"):
    """Convert an nn.Linear into an IncoherentLinear.

    The weight is conjugated by random sign vectors (SV on rows, SU on
    columns) and two-sided Hadamard transforms of sizes hadV/hadU, then each
    row is normalized by its RMS (stored as Wscale). Optionally folds a
    layernorm weight `lnorm` into the matrix first.

    Returns (inc_linear, kurt_stats) where kurt_stats holds kurtosis/skewness
    summary statistics of the rotated, normalized weight.
    """
    dtype_ = torch.float32
    dtype = linear.weight.data.dtype
    device = linear.weight.device
    inc_linear = IncoherentLinear(linear.in_features, linear.out_features, hadU, hadV, linear.bias is not None, dtype)
    # Random +/-1 sign vectors when not supplied by the caller.
    if SU is None:
        SU = ((torch.randn(linear.in_features, dtype=dtype_) > 0.0) * 2.0 - 1.0).to(device)
    if SV is None:
        SV = ((torch.randn(linear.out_features, dtype=dtype_) > 0.0) * 2.0 - 1.0).to(device)
    if lnorm is not None:
        lnorm = lnorm.to(device).to(dtype_)

    if linear.bias is not None:
        inc_linear.bias.data.copy_(linear.bias)

    W = linear.weight.data.to(dtype_)
    # Fold a preceding layernorm scale into the weight columns (float64 for
    # the matmul to limit rounding).
    Wr = (W.to(torch.float64).to(device) @ torch.diag(lnorm).to(torch.float64)).to(dtype_).to(device) if lnorm is not None else W
    # Two-sided incoherence transform: rows signed by SV + hadV-Hadamard,
    # columns signed by SU + hadU-Hadamard. The head-wise variant is only
    # needed when the transform size differs from the matrix dimension.
    if hadU != linear.in_features or hadV != linear.out_features:
        Wr = matmul_hadUt_head(matmul_hadUt_head(Wr.T.to(device) * SV, hadV).T * SU, hadU)
    else:
        Wr = matmul_hadUt(matmul_hadUt(Wr.T.to(device) * SV).T * SU)
    # Wscale = Wr.square().mean().sqrt()
    # Per-row RMS normalization (float64 accumulation), stored separately so
    # the quantizer sees unit-scale rows.
    Wscale = Wr.to(torch.float64).square().mean(-1).sqrt().view(-1, 1).to(dtype_)

    Wr = Wr / Wscale

    inc_linear.SU.data.copy_(SU.to(inc_linear.SU.dtype))
    # inc_linear.SV.data.copy_((SV * Wscale).to(inc_linear.SV.dtype))
    inc_linear.SV.data.copy_((SV).to(inc_linear.SV.dtype))
    inc_linear.Wscale.data.copy_(Wscale.view(-1))
    inc_linear.linear.weight.data.copy_(Wr.to(inc_linear.linear.weight.dtype))
    inc_linear = inc_linear.to(dtype).to(device)
    inc_linear.rot_info = rot_info
    inc_linear.apply_rot_info()


    # anal weight
    # Diagnostic statistics of the incoherence-processed weight; these do not
    # affect the returned layer.
    kurt = calc_kurtosis(inc_linear.linear.weight.data)
    skew = calc_skewness(inc_linear.linear.weight.data)
    # print(kurt.pow(2).mean(), kurt.mean(), kurt.std(), kurt.max(), kurt.min())
    # print pretty
    print(f"E[kurt^2]: {kurt.pow(2).mean():.4f}, E[kurt]: {kurt.mean():.4f}, std[kurt]: {kurt.std():.4f}, max[kurt]: {kurt.max():.4f}, min[kurt]: {kurt.min():.4f}")
    print(f"E[skew^2]: {skew.pow(2).mean():.4f}, E[skew]: {skew.mean():.4f}, std[skew]: {skew.std():.4f}, max[skew]: {skew.max():.4f}, min[skew]: {skew.min():.4f}")
    kurt_stats = {
        "kurt_pow2_mean": kurt.pow(2).mean(),
        "kurt_mean": kurt.mean(),
        "kurt_std": kurt.std(),
        "kurt_max": kurt.max(),
        "kurt_min": kurt.min(),
        "skew_pow2_mean": skew.pow(2).mean(),
        "skew_mean": skew.mean(),
        "skew_std": skew.std(),
        "skew_max": skew.max(),
        "skew_min": skew.min(),
    }
    return inc_linear, kurt_stats
627
+
628
+ if __name__ == "__main__":
629
+ linear = nn.Linear(4096, 4096, bias=True, dtype=torch.float16).cuda()
630
+ # linear.weight.data = linear.weight.data * 5 + 4
631
+ inc_linear = linear_to_incoherent(linear)
632
+
633
+ ran = torch.randn(4096, 4096, dtype=torch.float16).cuda()
634
+ orig = linear(ran)
635
+ inc = inc_linear(ran)
636
+
637
+ print((orig - inc).pow(2).mean() / orig.pow(2).mean())
638
+
639
+
lib/linear/quantized_linear.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import time
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from lib.codebook import bitshift
8
+ from lib.utils import (clean, dtype_from_str, get_hadK, has_kernel,
9
+ matmul_hadU_cuda)
10
+
11
+
12
class QuantizedLinear(nn.Module):
    """Trellis-coded (bitshift/QTIP-style) quantized linear layer.

    Weights are stored as packed int16 trellis codes plus an optional tunable
    LUT; decoding and the matmul are delegated to bitshift.BitshiftLinear,
    which is built lazily on the first forward call. Supports three modes:
    'eval', 'train-recons' (trellis unpacked for training), and 'train-fixW'
    (reconstructed weight cached, Hadamard factors released).
    """

    def __init__(
        self,
        in_features,
        out_features,
        td_x,
        td_y,
        L, # trellis window
        K, # bpw
        V, # vq dim
        tlut_bits, # tunable LUT bits
        decode_mode,
        bias=False,
        dtype=torch.float16,
        mode='eval',
        use_prev_kernel=True,
        grad_ckpt=False,
    ):
        super().__init__()

        self.in_features = in_features
        self.out_features = out_features
        # td_x/td_y: tile dimensions of the trellis blocks.
        self.td_x = td_x
        self.td_y = td_y
        self.L = L
        self.K = K
        self.V = V
        self.tlut_bits = tlut_bits
        self.decode_mode = decode_mode
        self.register_buffer('rcp', torch.tensor(0))
        # TP rank, not used unless rcp != 0
        self.register_buffer('tp_rank', torch.tensor(8))
        self.dtype = dtype
        # packed into int16
        # One row of K*(td_x*td_y) bits per (td_x, td_y) weight tile.
        self.register_buffer(
            'trellis',
            torch.zeros((out_features // td_x) * (in_features // td_y),
                        math.ceil((td_x * td_y) * K / 16),
                        dtype=torch.int16))

        # Tunable LUT is only materialized for the LUT-based decode modes.
        if decode_mode in ['lut', 'quantlut', 'quantlut_sym']:
            self.tlut = nn.Parameter(torch.zeros(2**tlut_bits,
                                                 V,
                                                 dtype=torch.float16),
                                     requires_grad=False)
        else:
            self.tlut = None

        if bias:
            self.register_buffer('bias', torch.ones(out_features))
        else:
            self.bias = None

        # Incoherence sign vectors; SV is kept in fp32.
        self.register_buffer("SU", torch.ones(in_features, dtype=self.dtype))
        self.register_buffer("SV", torch.ones(out_features,
                                              dtype=torch.float32))

        self.built_codebook_class = False
        self.built_graph = False

        had_left, K_left = get_hadK(in_features)
        had_right, K_right = get_hadK(out_features)
        self.register_buffer('had_left', had_left, persistent=False)
        self.register_buffer('had_right', had_right, persistent=False)
        self.K_left = K_left
        self.K_right = K_right
        self.mode = mode
        self.use_prev_kernel = use_prev_kernel
        self.grad_ckpt = grad_ckpt
        # Whether a fused decode kernel exists for this configuration.
        self.has_kernel = has_kernel(decode_mode, L, K, V, tlut_bits, td_x,
                                     td_y)

    def forward(self, input):
        if self.grad_ckpt:
            return self.ckpt_forward(input)
        return self.no_ckpt_forward(input)

    def ckpt_forward(self, input):
        # Gradient checkpointing wrapper; recomputes the forward in backward
        # to save activation memory.
        return torch.utils.checkpoint.checkpoint(self.no_ckpt_forward,
                                                 input,
                                                 use_reentrant=True)

    def no_ckpt_forward(self, input):
        # Lazy one-time setup: build the codebook decoder and apply the
        # mode-specific trellis/weight preparation.
        if not self.built_codebook_class:
            self.codebook_class = bitshift.BitshiftLinear(
                self.td_x,
                self.td_y,
                self.L,
                self.K,
                self.V,
                self.tlut_bits,
                self.decode_mode,
                dtype=self.dtype,
                tlut=self.tlut,
                has_kernel=self.has_kernel)

            # Replace the rcp buffer with a plain Python int so it is no
            # longer tracked by the state_dict after this point.
            rcp = self.rcp.item()
            del self.rcp
            self.rcp = rcp

            if self.mode == 'eval':
                pass
            elif self.mode == 'train-recons':
                # Without a fused kernel the trellis must be unpacked for
                # training; keep the packed copy on CPU.
                if not self.has_kernel:
                    self.packed_trellis = self.trellis.cpu()
                    unpacked_trellis = self.codebook_class.cb.unpack_trellis(
                        self.trellis, self.td_x * self.td_y)
                    self.trellis = unpacked_trellis
                    clean()
            elif self.mode == 'train-fixW':
                # Cache the reconstructed weight once, then drop the Hadamard
                # factors (set to None so later code can still reference them).
                self.codebook_class.cache_hatW(self.trellis, self.had_left,
                                               self.had_right, self.K_left,
                                               self.K_right, len(self.SV),
                                               len(self.SU), self.rcp,
                                               self.tp_rank)
                self.trellis = self.trellis.cpu()
                del self.had_left, self.had_right, self.K_left, self.K_right
                clean()
                self.had_left = None
                self.had_right = None
                self.K_left = None
                self.K_right = None
            else:
                raise Exception

            self.built_codebook_class = True

        # `+ 0` forces a copy of the kernel output — presumably to detach it
        # from kernel-owned memory; TODO confirm.
        result = self.codebook_class(input,
                                     self.trellis,
                                     self.SU,
                                     self.SV,
                                     self.had_left,
                                     self.had_right,
                                     self.K_left,
                                     self.K_right,
                                     self.rcp,
                                     self.tp_rank,
                                     mode=self.mode,
                                     use_prev_kernel=self.use_prev_kernel) + 0
        if self.bias is not None:
            return result + self.bias
        return result
+ return result
lib/linear/rotation.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from lib.utils import matmul_hadU_cuda, matmul_hadUt_cuda, matmul_hadUt_head, matmul_hadU_head, get_hadK
2
+ import torch.nn as nn
3
+
4
class RotateWeights(nn.Module):
    """Helper that applies two-sided Hadamard (de-)rotations to a weight
    matrix using transform sizes had_dim_U / had_dim_V.

    NOTE(review): SU and SV are stored but never used by apply_weights, and
    apply_weights transforms `weights.T` with the U-side factor first —
    verify this ordering against the callers before relying on it.
    """
    def __init__(self, had_dim_U, had_dim_V, SU=None, SV=None):
        super().__init__()
        self.had_dim_U = had_dim_U
        self.had_dim_V = had_dim_V
        # Optional sign vectors; currently unused in apply_weights.
        self.SU = SU
        self.SV = SV

        # Hadamard factor and block count for each side (get_hadK).
        self.had_left_U, self.K_left_U = get_hadK(had_dim_U)
        self.had_left_V, self.K_left_V = get_hadK(had_dim_V)

    def apply_weights(self, weights):
        # Apply the U-side inverse-Hadamard to the transposed weight, then the
        # V-side inverse-Hadamard to the result.
        return matmul_hadUt_head(matmul_hadUt_head(weights.T, self.had_left_U, self.K_left_U), self.had_left_V, self.K_left_V)
lib/linear/tcq_linear.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+
5
class QTIPLinearTCQ(nn.Module):
    """Trellis-coded quantized linear layer backed by custom CUDA kernels
    (torch.ops.ours_lib).

    Weights are stored as int16-packed trellis codes plus a per-layer LUT.
    Small batches (<= 8 rows) use a fused decompress+GEMM kernel; larger
    batches dequantize the full weight once and fall back to a dense matmul.
    """
    def __init__(
        self,
        in_features,
        out_features,
        td_x,
        td_y,
        L, # trellis window
        KV, # bpw
        V, # vq dim
        tlut_bits,
        bias=False,
        dtype=torch.float16,
    ):
        super().__init__()

        self.in_features = in_features
        self.out_features = out_features
        # td_x/td_y: tile dimensions of the trellis blocks.
        self.td_x = td_x
        self.td_y = td_y
        self.L = L
        self.KV = KV
        self.V = V
        self.tlut_bits = tlut_bits
        self.dtype = dtype
        # packed into int16
        # One row of KV*(td_x*td_y)/V bits per (td_x, td_y) weight tile.
        self.register_buffer(
            'trellis',
            torch.zeros((out_features // td_x) * (in_features // td_y),
                        math.ceil((td_x * td_y) * KV / 16 / V),
                        dtype=torch.int16))

        # Lookup table mapping each of 2**tlut_bits codes to a V-dim vector.
        self.tlut = nn.Parameter(torch.zeros(2**tlut_bits,
                                             V,
                                             dtype=torch.float16),
                                 requires_grad=False)

        if bias:
            self.register_buffer('bias', torch.ones(out_features))
        else:
            self.bias = None

    def _info(self):
        """Return a plain-dict snapshot (CPU tensors) sufficient to rebuild
        this layer via gen_layer_from_info."""
        info = {
            "in_features": self.in_features,
            "out_features": self.out_features,
            "td_x": self.td_x,
            "td_y": self.td_y,
            "L": self.L,
            "KV": self.KV,
            "V": self.V,
            'tlut_bits': self.tlut_bits,
            "dtype": self.dtype,
            "trellis": self.trellis.detach().cpu(),
            "tlut": self.tlut.detach().cpu().half(),
            "bias": self.bias.detach().cpu() if self.bias is not None else None,
        }
        return info

    def forward(self, inp, **kwargs):
        x = inp.view(-1, self.in_features)#.to(torch.float32)
        bs = x.shape[0]
        m, k = self.out_features, self.in_features
        if bs <= 8:
            # Fused decompress+GEMM kernel specialized per (m, bs, k, lut
            # bits, bpw) combination.
            wrapper = getattr(
                torch.ops.ours_lib,
                f"decompress_gemm_tcq_{m}_{bs}_{k}_{self.tlut_bits}_{self.KV}")

            x = wrapper(self.trellis, x, self.tlut)

        else:
            # Large batch: dequantize the whole weight once and use a dense
            # matmul.
            wrapper = getattr(
                torch.ops.ours_lib,
                f"decompress_tcq_{self.tlut_bits}_{self.KV}"
            )
            # dq = wrapper(self.trellis, self.tlut).to(x.dtype)
            # x = x @ dq.T
            with torch.no_grad():
                dq = wrapper(self.trellis, self.tlut, m, k) #.to(x.dtype)
            x = (x.to(dq.dtype) @ dq.T)#.to(x.dtype)
        return x.view(*inp.shape[:-1], m).to(inp.dtype)

    @staticmethod
    def gen_layer_from_info(info):
        """Rebuild a QTIPLinearTCQ from a dict produced by _info()."""
        layer = QTIPLinearTCQ(info["in_features"], info["out_features"], info["td_x"], info["td_y"], info["L"], info["KV"], info["V"], info["tlut_bits"], info["bias"] is not None, info["dtype"])
        layer.trellis.data.copy_(info["trellis"])
        layer.tlut.data.copy_(info["tlut"])
        if info["bias"] is not None:
            layer.bias.data.copy_(info["bias"])
        return layer

    @staticmethod
    def merge_infos(info1, info2):
        """Fuse two _info() dicts into one describing a layer whose output is
        the concatenation of both (used to merge e.g. up+gate projections).
        Requires identical quantization configs and a shared LUT; biases are
        not supported."""
        assert info1["in_features"] == info2["in_features"]
        assert info1["td_x"] == info2["td_x"]
        assert info1["td_y"] == info2["td_y"]
        assert info1["L"] == info2["L"]
        assert info1["KV"] == info2["KV"]
        assert info1["V"] == info2["V"]
        assert info1["tlut_bits"] == info2["tlut_bits"]
        if not torch.allclose(info1["tlut"], info2["tlut"], atol=1e-4):
            print("warning: tlut is not close. it is unexpected behavior if you do not use dummy quantizers.")
        assert info1["bias"] is None and info2["bias"] is None
        assert info1["dtype"] == info2["dtype"]
        info = {}
        info["in_features"] = info1["in_features"]
        info["out_features"] = info1["out_features"] + info2["out_features"]
        info["td_x"] = info1["td_x"]
        info["td_y"] = info1["td_y"]
        info["L"] = info1["L"]
        info["KV"] = info1["KV"]
        info["V"] = info1["V"]
        info["tlut_bits"] = info1["tlut_bits"]
        info["bias"] = None
        info["dtype"] = info1["dtype"]
        # Trellis rows are tile-major over output rows, so concatenating on
        # dim 0 stacks the two weight matrices vertically.
        info["trellis"] = torch.cat([info1["trellis"], info2["trellis"]], dim=0)
        info["tlut"] = info1["tlut"]
        return info
lib/linear/vq_linear.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch
3
+ import torch.nn as nn
4
+
5
class VQLinearPackTensorCore(nn.Module):
    """Vector-quantized linear layer using tensor-core CUDA kernels
    (torch.ops.ours_lib).

    Weights are stored as int32-packed LUT indices (`qweight`) plus a
    codebook (`lut`) of 2**lut_bits vectors of size vec_sz. Small batches
    (<= 8 rows) use a fused decompress+GEMM kernel; larger batches dequantize
    once and use a dense matmul.
    """
    def __init__(self, in_features, out_features, lut_bits, vec_sz=2, bias=False, dtype=torch.half):
        super().__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.lut_bits = lut_bits
        self.dtype = dtype
        self.vec_sz = vec_sz

        # lut_bits bits per vec_sz weights, packed into 32-bit words.
        # Initialized with random data; real codes are copied in by
        # gen_layer_from_info.
        self.register_buffer(
            'qweight',
            torch.randint(0, 4, (out_features, lut_bits*in_features // 32 // vec_sz), dtype=torch.int32, device='cuda')
        )

        self.register_buffer(
            'lut',
            torch.randn((2 ** lut_bits, vec_sz), dtype=self.dtype, device='cuda')
        )

        if bias:
            self.register_buffer(
                "bias",
                torch.randn((out_features,), dtype=self.dtype, device='cuda')
            )
        else:
            self.bias = None

        # Kernel-name suffix: vector quantization for vec_sz > 1, otherwise
        # scalar quantization ("sq_dup" duplicated variant for <= 4 bits).
        self.vq_type = f"vq{self.vec_sz}" if self.vec_sz > 1 else "sq_dup" if lut_bits <= 4 else "sq"

    def _info(self):
        """Return a plain-dict snapshot (CPU tensors) sufficient to rebuild
        this layer via gen_layer_from_info."""
        info = {
            "in_features": self.in_features,
            "out_features": self.out_features,
            "lut_bits": self.lut_bits,
            "dtype": self.dtype,
            "vec_sz": self.vec_sz,
            "qweight": self.qweight.detach().cpu(),
            "lut": self.lut.detach().cpu().half(),
            "bias": self.bias.detach().cpu() if self.bias is not None else None,
        }
        return info

    def forward(self, inp, **kwargs):
        x = inp.view(-1, self.in_features)
        bs = x.shape[0]
        m, k = self.out_features, self.in_features
        if bs <= 8:
            # Fused decompress+GEMM kernel specialized per shape/config.
            wrapper = getattr(
                torch.ops.ours_lib,
                f"decompress_gemm_{m}_{bs}_{k}_{self.lut_bits}_{self.vq_type}"
            )

            x = wrapper(self.qweight, x, self.lut)
        else:
            # Large batch: dequantize the whole weight once, then dense
            # matmul.
            wrapper = getattr(
                torch.ops.ours_lib,
                f"decompress_{self.lut_bits}_{self.vq_type}"
            )
            with torch.no_grad():
                dq = wrapper(self.qweight, self.lut, m, k)
            x = (x.to(dq.dtype) @ dq.T)

        return x.view(*inp.shape[:-1], m).to(inp.dtype)

    @staticmethod
    def gen_layer_from_info(info):
        """Rebuild a VQLinearPackTensorCore from a dict produced by _info()."""
        layer = VQLinearPackTensorCore(info["in_features"], info["out_features"], info["lut_bits"], info["vec_sz"], info["bias"] is not None, info["dtype"])
        layer.qweight.data.copy_(info["qweight"])
        layer.lut.data.copy_(info["lut"])
        if info["bias"] is not None:
            layer.bias.data.copy_(info["bias"])
        return layer

    @staticmethod
    def merge_infos(info1, info2):
        """Fuse two _info() dicts into one describing a layer whose output is
        the concatenation of both. Requires identical quantization configs
        and a shared LUT; biases are not supported."""
        assert info1["in_features"] == info2["in_features"]
        assert info1["lut_bits"] == info2["lut_bits"]
        assert info1["vec_sz"] == info2["vec_sz"]
        assert info1["bias"] is None and info2["bias"] is None
        assert info1["dtype"] == info2["dtype"]
        if not torch.allclose(info1["lut"], info2["lut"], atol=1e-4):
            print("warning: lut is not close. it is unexpected behavior if you do not use dummy quantizers.")
        info = {}
        info["in_features"] = info1["in_features"]
        info["out_features"] = info1["out_features"] + info2["out_features"]
        info["lut_bits"] = info1["lut_bits"]
        info["vec_sz"] = info1["vec_sz"]
        info["bias"] = None
        info["dtype"] = info1["dtype"]
        # qweight rows correspond to output rows, so dim-0 concat stacks the
        # two weight matrices vertically.
        info["qweight"] = torch.cat([info1["qweight"], info2["qweight"]], dim=0)
        info["lut"] = info1["lut"]
        return info
99
class VQLinearPackSIMT(nn.Module):
    """Vector-quantized linear layer using SIMT (non-tensor-core) CUDA kernels
    (torch.ops.ours_lib).

    Same storage scheme as VQLinearPackTensorCore (int32-packed LUT indices +
    codebook), but the qweight bit layout differs: gen_layer_from_info
    converts from the default tensor-core packing to the SIMT packing for
    vec_sz <= 2. Small batches (<= 8 rows) use a fused GEMM kernel, larger
    batches dequantize then dense-matmul; scalar (vec_sz == 1) and vector
    cases dispatch to different kernels.
    """
    def __init__(self, in_features, out_features, lut_bits, vec_sz=1, bias=False, dtype=torch.half):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.lut_bits = lut_bits
        self.dtype = dtype
        self.vec_sz = vec_sz

        # lut_bits bits per vec_sz weights, packed into 32-bit words.
        # Initialized with random data; real codes are copied in by
        # gen_layer_from_info.
        self.register_buffer(
            'qweight',
            torch.randint(0, 4, (out_features, lut_bits*in_features // 32 // vec_sz), dtype=torch.int32, device='cuda')
        )

        self.register_buffer(
            'lut',
            torch.randn((2 ** lut_bits, vec_sz), dtype=self.dtype, device='cuda')
        )

        if bias:
            self.register_buffer(
                "bias",
                torch.randn((out_features,), dtype=self.dtype, device='cuda')
            )
        else:
            self.bias = None

    def _info(self):
        """Return a plain-dict snapshot (CPU tensors) sufficient to rebuild
        this layer via gen_layer_from_info."""
        info = {
            "in_features": self.in_features,
            "out_features": self.out_features,
            "lut_bits": self.lut_bits,
            "dtype": self.dtype,
            "vec_sz": self.vec_sz,
            "qweight": self.qweight.detach().cpu(),
            "lut": self.lut.detach().cpu().half(),
            "bias": self.bias.detach().cpu() if self.bias is not None else None,
        }
        return info

    def forward(self, inp, **kwargs):
        # SIMT GEMM kernels expect a (bs, 1, k) input layout.
        x = inp.view(-1, 1, self.in_features)
        bs = x.shape[0]
        m, k = self.out_features, self.in_features
        if bs <= 8:
            if self.vec_sz == 1:
                # Scalar-quantized kernel takes lut_bits as a runtime arg.
                wrapper = getattr(
                    torch.ops.ours_lib,
                    f"sq_pack_gemm_simt"
                )
                x = wrapper(x, self.qweight, self.lut, self.lut_bits)
            else:
                # Vector-quantized kernel specialized per (bs, vec_sz,
                # lut_bits).
                wrapper = getattr(
                    torch.ops.ours_lib,
                    f"vq_pack_gemm_simt_{bs}_{self.vec_sz}_{self.lut_bits}"
                )
                x = wrapper(x, self.qweight, self.lut)
        else:
            # Large batch: dequantize the whole weight once, then dense
            # matmul.
            if self.vec_sz == 1:
                wrapper = getattr(
                    torch.ops.ours_lib,
                    f"sq_pack_dequant_simt"
                )
                with torch.no_grad():
                    dq = wrapper(self.qweight, self.lut, self.lut_bits, m, k)
            else:
                wrapper = getattr(
                    torch.ops.ours_lib,
                    f"vq_pack_dequant_simt_{self.vec_sz}_{self.lut_bits}"
                )
                with torch.no_grad():
                    dq = wrapper(self.qweight, self.lut, m, k)
            x = (x.to(dq.dtype) @ dq.T)
        return x.view(*inp.shape[:-1], m).to(inp.dtype)

    @staticmethod
    def gen_layer_from_info(info):
        """Rebuild a VQLinearPackSIMT from a dict produced by _info(),
        re-packing the qweight layout when necessary."""
        layer = VQLinearPackSIMT(info["in_features"], info["out_features"], info["lut_bits"], info["vec_sz"], info["bias"] is not None, info["dtype"])
        if info["vec_sz"] <= 2:
            from lib.quantizer.quant_op import convert_tensor_core_to_simt
            # qweight is stored in tensor core format in default.
            # we should convert it to simt format.
            converted_qweight = convert_tensor_core_to_simt(info["qweight"], info["out_features"], info["in_features"], info["vec_sz"], info["lut_bits"], code_n=info["lut_bits"])
            layer.qweight.data.copy_(converted_qweight)
        else:
            layer.qweight.data.copy_(info["qweight"])
        layer.lut.data.copy_(info["lut"])
        if info["bias"] is not None:
            layer.bias.data.copy_(info["bias"])
        return layer

    @staticmethod
    def merge_infos(info1, info2):
        """Fuse two _info() dicts into one describing a layer whose output is
        the concatenation of both. Requires identical quantization configs
        and a shared LUT; biases are not supported."""
        assert info1["in_features"] == info2["in_features"]
        assert info1["lut_bits"] == info2["lut_bits"]
        assert info1["vec_sz"] == info2["vec_sz"]
        assert info1["bias"] is None and info2["bias"] is None
        assert info1["dtype"] == info2["dtype"]
        if not torch.allclose(info1["lut"], info2["lut"], atol=1e-4):
            print("warning: lut is not close. it is unexpected behavior if you do not use dummy quantizers.")
        info = {}
        info["in_features"] = info1["in_features"]
        info["out_features"] = info1["out_features"] + info2["out_features"]
        info["lut_bits"] = info1["lut_bits"]
        info["vec_sz"] = info1["vec_sz"]
        info["bias"] = None
        info["dtype"] = info1["dtype"]
        # qweight rows correspond to output rows, so dim-0 concat stacks the
        # two weight matrices vertically.
        info["qweight"] = torch.cat([info1["qweight"], info2["qweight"]], dim=0)
        info["lut"] = info1["lut"]
        return info
lib/quantizer/__pycache__/comb_quant.cpython-311.pyc ADDED
Binary file (15 kB). View file
 
lib/quantizer/__pycache__/nuq_op.cpython-311.pyc ADDED
Binary file (23.7 kB). View file
 
lib/quantizer/__pycache__/pack_op.cpython-311.pyc ADDED
Binary file (12.3 kB). View file
 
lib/quantizer/__pycache__/pack_op.general_pack_32-88.py311.1.nbc ADDED
Binary file (43.8 kB). View file
 
lib/quantizer/__pycache__/pack_op.general_pack_32-88.py311.nbi ADDED
Binary file (1.59 kB). View file
 
lib/quantizer/__pycache__/pack_op.pack_32-242.py311.1.nbc ADDED
Binary file (39.4 kB). View file
 
lib/quantizer/__pycache__/pack_op.pack_32-242.py311.nbi ADDED
Binary file (1.62 kB). View file
 
lib/quantizer/__pycache__/pack_op.pack_codes_32-186.py311.1.nbc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b3053fa289922289be50c3c5f21b9101f8426050f437286e34141275fa858e8
3
+ size 116854
lib/quantizer/__pycache__/pack_op.pack_codes_32-186.py311.nbi ADDED
Binary file (1.59 kB). View file
 
lib/quantizer/__pycache__/pack_op.pack_for_sq_pack_kernel-287.py311.1.nbc ADDED
Binary file (67.1 kB). View file
 
lib/quantizer/__pycache__/pack_op.pack_for_sq_pack_kernel-287.py311.nbi ADDED
Binary file (1.71 kB). View file
 
lib/quantizer/__pycache__/quant_op.cpython-311.pyc ADDED
Binary file (23.4 kB). View file
 
lib/quantizer/__pycache__/tcq_quant.cpython-311.pyc ADDED
Binary file (13.2 kB). View file
 
lib/quantizer/__pycache__/vq_quant.cpython-311.pyc ADDED
Binary file (11.9 kB). View file
 
lib/quantizer/__pycache__/vq_quant_ldlq.cpython-311.pyc ADDED
Binary file (6.71 kB). View file
 
lib/quantizer/comb_quant.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from lib.quantizer.quant_op import load_hessian, load_group_hessian
3
+ from lib.quantizer.tcq_quant import qtip_quantize_mat, linear_to_incoherent_for_tcq, Args
4
+ from lib.linear import CombLinearTCQ, CombtLinearTCQ
5
+ from lib.codebook.bitshift import bitshift_codebook
6
+ from lib.utils import clean
7
+ import time
8
+ from lib.utils import block_LDL
9
+ from lib.algo.ldlq import LDLQ_combt
10
+
11
def pack_trellis(Qidxs, td_x, td_y, cb, m, n, KV, V):
    """Pack trellis code indices into the interleaved int16 layout the kernel expects.

    The indices are first tiled into (td_x, td_y) blocks and packed by the
    codebook, then the resulting bit stream is re-shuffled at nibble (4-bit)
    granularity into a 2x2-tile-interleaved order, and finally re-fused into
    16-bit words on the GPU.

    Args (roles inferred from the reshapes below — confirm against callers):
        Qidxs: index tensor holding m * n // V entries.
        td_x, td_y: trellis tile dimensions.
        cb: codebook object providing pack_trellis().
        m, n: dimensions of the weight slab covered by these indices.
        KV: packed words per tile group (bits-per-weight dependent).
        V: vector size per trellis index.

    Returns:
        CUDA int16 tensor with the same shape as cb.pack_trellis's output.
    """
    Qidxs = Qidxs.cpu()
    # Reorder into (m/td_x) x (n/td_y) tiles before codebook bit-packing.
    packed = cb.pack_trellis(
        Qidxs.reshape(m // td_x, td_x, n // td_y,
                      td_y // V).transpose(1, 2).reshape(
                          -1, td_x * td_y // V))

    # Split every 16-bit word into 4 nibbles (low nibble of each byte first,
    # then the whole quartet flipped).
    packed_8 = packed.view(torch.uint8).view(-1, 2)
    packed_4 = torch.cat([packed_8.unsqueeze(-1) & (2 ** 4 - 1), (packed_8.unsqueeze(-1) & (2 ** 8 - 2 ** 4)) >> 4], dim=-1).view(-1, 4).flip(
        (-1, ))

    # Interleave nibbles across 2x2 groups of 16x16 tiles — presumably to match
    # the CUDA kernel's access pattern (TODO confirm against the kernel).
    packed_4 = packed_4.reshape(m // 16 // 2, 2, n // 16 // 2, 2, 16 * 16 // 8,
                                KV).permute(0, 2, 4, 3, 1, 5).flip(
                                    (-1, )).contiguous().flatten()
    # Re-fuse nibble pairs into bytes (low nibble + high nibble * 16), then
    # reinterpret as int16 in the original packed shape.
    packed_8 = torch.sum(packed_4.view(-1, 2) * torch.Tensor([[1, 2 ** 4]]).to(torch.uint8), dim=-1).to(torch.uint8).contiguous()
    packed = packed_8.view(torch.int16).reshape(packed.shape).cuda()
    return packed
28
+
29
def combt_quantize_mat(Wr, HRr, Wscale, cb1, cb2, td_x=16, td_y=16, KV=(4,5), V=2, use_hess=True):
    """LDLQ-quantize a weight matrix with two trellis codebooks split along the input dim.

    Computes a block-LDL factor per Hessian group (or of the identity when
    use_hess=False), runs LDLQ_combt on each horizontal slab of rows, and
    packs the first / second half of the input dimension with cb1 / cb2.

    Args:
        Wr: (m, n) normalized weight matrix.
        HRr: (n, n, gs) grouped Hessian stack.
        Wscale: per-row scale undone before error measurement.
        cb1, cb2: trellis codebooks for the two input halves.
        KV: per-codebook packed word counts; V: vector size per index.

    Returns:
        (packed1, packed2, hatWr, quant_info) — two packed trellis buffers,
        the rescaled reconstruction, and a stats dict with the relative error.
    """
    (m, n) = Wr.shape
    Wr = Wr.to(torch.float64)
    HRr_orig = HRr.clone()  # NOTE(review): unused after this clone
    gs = HRr.shape[-1]  # number of Hessian groups
    LRrs = []
    diag = torch.arange(n, device=HRr.device)
    if not use_hess:
        # Identity Hessian: no error-feedback weighting.
        # NOTE(review): only one L is appended here, but the loop below indexes
        # LRrs[i] for i in range(gs) — confirm gs == 1 whenever use_hess=False.
        eye = torch.eye(n, device=Wr.device, dtype=torch.float64)
        LRr, D = block_LDL(eye, td_y)
        LRr[diag, diag] = 0
        LRrs.append(LRr)
    else:
        for i in range(gs):
            LRr, D = block_LDL(HRr[:,:,i], td_y)
            LRr[diag, diag] = 0  # zero the unit diagonal; LDLQ uses the strictly-lower part
            LRrs.append(LRr)

    args = Args(td_x, td_y, V)

    Qidxs_list = []
    hatWr_list = []
    for i in range(gs):
        # Each Hessian group quantizes its own horizontal slab of m // gs rows.
        cur_Wr = Wr[m // gs * i:m // gs * (i+1)]
        hatWr, Qidxs = LDLQ_combt(cur_Wr, LRrs[i], cb1.cuda(), cb2.cuda(), args, for_kernel=True)
        hatWr_list.append(hatWr)
        Qidxs_list.append(Qidxs)
        torch._dynamo.reset()  # drop torch.compile state between groups

    hatWr = torch.cat(hatWr_list, dim=0)
    Qidxs = torch.cat(Qidxs_list, dim=0)
    assert hatWr.shape == Wr.shape, f"hatWr.shape {hatWr.shape} != Wr.shape {Wr.shape}"

    # First half of the input dim packs with cb1/KV[0], second half with cb2/KV[1].
    packed1 = pack_trellis(Qidxs[:, :n//2//V].contiguous(), td_x, td_y, cb1, m, n//2, KV[0], V)
    packed2 = pack_trellis(Qidxs[:, n//2//V:].contiguous(), td_x, td_y, cb2, m, n//2, KV[1], V)

    # Undo the per-row normalization before measuring reconstruction error.
    Wr *= Wscale.reshape(-1, 1)
    hatWr *= Wscale.reshape(-1, 1)

    orig_err = (Wr - hatWr).pow(2).mean()
    err = orig_err / Wr.pow(2).mean()  # relative MSE
    print(
        f'err {err.item()} orig_err {orig_err.item()}'
    )
    quant_info = {
        "quantizer": "combt_ldlq",
        "td_x": td_x,
        "td_y": td_y,
        "KV": KV,
        "V": V,
        "tlut_bits": cb1.tlut_bits,
        "use_hess": use_hess,
        "orig_err": orig_err.item(),
        "err": err.item(),
    }
    return packed1, packed2, hatWr, quant_info
85
+
86
def inc_linear_to_inc_combt_linear(inc_linear, HRr, cb1, cb2, td_x=16, td_y=16, in_part=(2048, 2048), KV=(3, 4), V=2, scale_override=0.9, use_hess=True):
    """Swap an incoherent layer's inner linear for a CombtLinearTCQ (input-split TCQ).

    The weight is pre-shrunk by scale_override (Wscale is inflated to
    compensate), quantized as one matrix whose input dimension is split across
    cb1/cb2 by combt_quantize_mat, and the packed trellises plus the shared
    tlut are copied into a freshly constructed CombtLinearTCQ that replaces
    inc_linear.linear in place.

    Returns:
        (inc_linear, quant_info) — the mutated wrapper and the stats dict
        produced by combt_quantize_mat.
    """
    Wr = (inc_linear.linear.weight.data * scale_override).to(HRr.dtype)
    Wscale = inc_linear.Wscale.data / scale_override
    inc_linear.Wscale.data.copy_(Wscale)
    assert in_part[0] + in_part[1] == Wr.shape[1], "in_part is not correct"
    assert torch.allclose(cb1.tlut, cb2.tlut), "cb1 and cb2 must have the same tlut"

    packed1, packed2, hatWr, quant_info = combt_quantize_mat(Wr, HRr, Wscale, cb1, cb2, td_x=td_x, td_y=td_y, KV=KV, V=V, use_hess=use_hess)
    torch._dynamo.reset()  # clear any torch.compile state left by quantization
    out_features, in_features = Wr.shape
    comb_linear = CombtLinearTCQ(
        in_features,
        out_features,
        td_x=td_x,
        td_y=td_y,
        in_part=in_part,
        L=16,
        KV=KV,
        V=V,
        tlut_bits=cb1.tlut_bits,
        bias=inc_linear.bias is not None,
        dtype=inc_linear.dtype,
    )

    comb_linear.trellis1.data.copy_(packed1)
    comb_linear.trellis2.data.copy_(packed2)
    comb_linear.tlut.data.copy_(cb1.tlut)
    inc_linear.linear = comb_linear
    return inc_linear, quant_info
115
+
116
+
117
def inc_linear_to_inc_comb_linear(inc_linear, HRr, cb1, cb2, td_x=16, td_y=16, out_part=(2048, 2048), KV=(3, 4), V=2, scale_override=0.9, use_hess=True):
    """Quantize an incoherent layer into a CombLinearTCQ split along OUTPUT rows.

    The first out_part[0] rows are quantized independently with cb1/KV[0] and
    the remaining rows with cb2/KV[1]; both use the same ungrouped Hessian.
    The packed trellises and shared tlut are copied into a new CombLinearTCQ
    that replaces inc_linear.linear in place.

    Returns:
        (inc_linear, quant_info) — combined stats plus the per-part
        quant_info1/quant_info2 dicts from qtip_quantize_mat.
    """
    # scale_override shrinks weights before rounding; Wscale is inflated to compensate.
    Wr = (inc_linear.linear.weight.data * scale_override).to(HRr.dtype)
    Wscale = inc_linear.Wscale.data / scale_override
    inc_linear.Wscale.data.copy_(Wscale)
    assert out_part[0] + out_part[1] == Wr.shape[0], "out_part is not correct"
    assert len(HRr.shape) == 3 and HRr.shape[0] == HRr.shape[1] and HRr.shape[-1] == 1, f"support only none-grouped hessian but shape: {HRr.shape}"

    packed1, hatWr1, quant_info1 = qtip_quantize_mat(Wr[:out_part[0]], HRr, Wscale[:out_part[0]], cb1, td_x=td_x, td_y=td_y, KV=KV[0], V=V, use_hess=use_hess)
    torch._dynamo.reset()  # clear torch.compile state between the two runs
    packed2, hatWr2, quant_info2 = qtip_quantize_mat(Wr[out_part[0]:], HRr, Wscale[out_part[0]:], cb2, td_x=td_x, td_y=td_y, KV=KV[1], V=V, use_hess=use_hess)
    torch._dynamo.reset()
    out_features, in_features = Wr.shape
    comb_linear = CombLinearTCQ(
        in_features,
        out_features,
        td_x=td_x,
        td_y=td_y,
        out_part=out_part,
        L=16,
        KV=KV,
        V=V,
        tlut_bits=cb1.tlut_bits,
        bias=inc_linear.bias is not None,
        dtype=inc_linear.dtype,
    )

    comb_linear.trellis1.data.copy_(packed1)
    comb_linear.trellis2.data.copy_(packed2)
    comb_linear.tlut.data.copy_(cb1.tlut)

    # NOTE(review): unlike combt_quantize_mat, Wr is not rescaled by Wscale
    # before this error computation — confirm qtip_quantize_mat returns hatWr
    # in Wr's (normalized) scale.
    hatWr = torch.cat([hatWr1, hatWr2], dim=0).to(HRr.dtype)
    orig_err = (Wr - hatWr).pow(2).mean()
    err = orig_err / Wr.pow(2).mean()

    quant_info = {
        "quantizer": "comb_ldlq",
        "td_x": td_x,
        "td_y": td_y,
        "KV": KV,
        "V": V,
        "use_hess": use_hess,
        "orig_err": orig_err.item(),
        "err": err.item(),
        "quant_info1": quant_info1,
        "quant_info2": quant_info2,
    }

    inc_linear.linear = comb_linear
    return inc_linear, quant_info
166
+
167
def linear_to_comb_linear(target_layer, hess_path, cb1, cb2, scale_override=0.9, out_part=(2048, 2048), KV=[3, 4], V=2, use_hess=True, SU=None, SV=None, lnorm=None, hadU=None, hadV=None, rot_info="all", left_only=False, ghess_key=""):
    """End-to-end pipeline: nn.Linear -> incoherent rotation -> output-split TCQ.

    Loads the Hessian (grouped variant when ghess_key is given, identity when
    hess_path is None), rotates the layer into the incoherent basis, then
    quantizes via inc_linear_to_inc_comb_linear.

    Returns:
        (layer, quant_info) — the fp16 quantized layer and a stats dict
        augmented with scale_override, hess_path and wall-clock time.
    """
    assert torch.allclose(cb1.tlut, cb2.tlut), "cb1 and cb2 must have the same tlut"
    t0 = time.time()
    out_features, in_features = target_layer.weight.shape
    if ghess_key == "":
        # Fall back to an identity Hessian when no Hessian file is given.
        HR = load_hessian(hess_path).cuda() if hess_path is not None else torch.eye(in_features, device="cuda", dtype=torch.float64).unsqueeze(-1)
    else:
        HR = load_group_hessian(hess_path, layer_key=ghess_key).cuda()
    layer, HRr = linear_to_incoherent_for_tcq(target_layer, cb1, HR, scale_override, SU=SU, SV=SV, lnorm=lnorm, hadU=hadU, hadV=hadV, rot_info=rot_info, left_only=left_only)
    HRr = HRr.cuda()
    layer = layer.cuda()
    # scale_override=1.0 here: the incoherence step above already received it.
    layer, quant_info = inc_linear_to_inc_comb_linear(layer, HRr, cb1, cb2, scale_override=1.0, td_x=16, td_y=16, out_part=out_part, KV=KV, V=V, use_hess=use_hess)
    quant_info["scale_override"] = scale_override
    quant_info["hess_path"] = hess_path
    quant_info["time"] = time.time() - t0

    return layer.to(torch.float16), quant_info
184
+
185
def linear_to_combt_linear(target_layer, hess_path, cb1, cb2, scale_override=0.9, in_part=(2048, 2048), KV=[3, 4], V=2, use_hess=True, SU=None, SV=None, lnorm=None, hadU=None, hadV=None, rot_info="all", left_only=False, ghess_key=""):
    """End-to-end pipeline: nn.Linear -> incoherent rotation -> input-split TCQ.

    Mirror of linear_to_comb_linear, but the quantization splits along the
    INPUT dimension (in_part) via inc_linear_to_inc_combt_linear.

    Returns:
        (layer, quant_info) — the fp16 quantized layer and a stats dict
        augmented with scale_override, hess_path and wall-clock time.
    """
    assert torch.allclose(cb1.tlut, cb2.tlut), "cb1 and cb2 must have the same tlut"
    t0 = time.time()
    out_features, in_features = target_layer.weight.shape
    if ghess_key == "":
        # Fall back to an identity Hessian when no Hessian file is given.
        HR = load_hessian(hess_path).cuda() if hess_path is not None else torch.eye(in_features, device="cuda", dtype=torch.float64).unsqueeze(-1)
    else:
        HR = load_group_hessian(hess_path, layer_key=ghess_key).cuda()
    layer, HRr = linear_to_incoherent_for_tcq(target_layer, cb1, HR, scale_override, SU=SU, SV=SV, lnorm=lnorm, hadU=hadU, hadV=hadV, rot_info=rot_info, left_only=left_only)
    HRr = HRr.cuda()
    layer = layer.cuda()
    # scale_override=1.0 here: the incoherence step above already received it.
    layer, quant_info = inc_linear_to_inc_combt_linear(layer, HRr, cb1, cb2, scale_override=1.0, td_x=16, td_y=16, in_part=in_part, KV=KV, V=V, use_hess=use_hess)
    quant_info["scale_override"] = scale_override
    quant_info["hess_path"] = hess_path
    quant_info["time"] = time.time() - t0

    return layer.to(torch.float16), quant_info
lib/quantizer/nuq_op.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import torch
3
+ import numpy as np
4
+ import time
5
+ from tqdm import tqdm
6
+ from typing import Tuple
7
+
8
def get_progress_bar(total: int, desc: str):
    """Build a tqdm progress bar with the compact format used by this module."""
    bar_format = '{l_bar}{bar:10}{r_bar}{bar:-10b}'
    return tqdm(total=total, desc=desc, bar_format=bar_format)
14
+
15
@torch.no_grad()
def objective_function(
    W: torch.Tensor,
    H: torch.Tensor,
    P: torch.Tensor,
    C: torch.Tensor,
) -> torch.Tensor:
    """
    Calculate the quantization error (objective value).

    Reconstructs W_hat from the one-hot assignments P and centroids C, then
    evaluates the H-weighted squared error per row and returns the mean.

    Args:
        W: Weight matrix (row_count * group_count, group_size)
        H: Hessian matrix (blk_num, group_size, group_size)
        P: Assignment matrix (row_count * group_count, group_size//vec_sz, n_cluster)
        C: Centroid matrix (n_cluster, vec_sz)

    Returns:
        Objective value (scalar)
    """

    device = torch.device("cuda")
    P, C = P.to(device), C.to(device)
    W_hat = torch.einsum('ijc,ck->ijk', P, C)  # Shape: (row_count * group_count, group_size//vec_sz, vec_sz)
    W_hat = W_hat.view(W_hat.shape[0], -1)  # Shape: (row_count * group_count, group_size)
    delta_w = W_hat - W

    blk_num = H.shape[0]
    blk_size = W.shape[0] // blk_num

    delta_w = delta_w.reshape(blk_num, blk_size, delta_w.shape[-1])
    # Per intra-block row i: sum over blocks n of delta_w[n,i] @ H[n] @ delta_w[n,i]^T.
    objective_value = torch.einsum('nij,njk,nik->i', delta_w, H, delta_w)
    total_error = objective_value.mean()

    return total_error
49
+
50
+
51
@torch.no_grad()
def parallel_objective_function_sub(
    W: torch.Tensor,  # Shape: (b, g_cd)
    quadratic: torch.Tensor,  # Shape: (g_cd, g_cd)
    linear: torch.Tensor,  # Shape: (b, g_cd)
    W_hat_options: torch.Tensor,  # Shape: (b, g_cd, num_options)
) -> torch.Tensor:
    """
    Calculate the quantization error (objective value), and return the list of errors for each options.
    W_hat is a tensor with possible options concatenated along the last dimension.

    The option-independent constant term is omitted, so the returned values
    are only meaningful for comparing options against each other.

    Args:
        W: Weight matrix (b, g_cd)
        quadratic: torch.Tensor, # Shape: (g_cd, g_cd)
        linear: torch.Tensor, # Shape: (b, g_cd)
        W_hat_options: torch.Tensor, # Shape: (b, g_cd, num_options)

    Returns:
        Possible objective values for each options (b, num_options)
    """
    device = torch.device("cuda")
    W_hat_options = W_hat_options.to(device)
    b, g_cd, num_options = W_hat_options.shape

    # Deviation of every candidate option from the target weights.
    delta_w_g = W_hat_options - W.unsqueeze(2).expand(-1, -1, num_options)

    quadratic_term = torch.einsum('jk,ijp,ikp->ip', quadratic, delta_w_g, delta_w_g)
    linear_term = torch.einsum('ij,ijp->ip', linear, delta_w_g)
    total_error_quad = quadratic_term + linear_term
    return total_error_quad
81
+
82
+
83
+
84
def update_batch_P(
    W: torch.Tensor,  # Shape: (b, group_size)
    H: torch.Tensor,  # Shape: (blk_num, group_size, group_size)
    P: torch.Tensor,  # Shape: (b, group_size // vec_sz, n_cluster)
    C: torch.Tensor,  # Shape: (n_cluster, vec_sz)
    iteration: int,
    g_cd: int,  # Number of weights to update at a time
    cd_cycles: int,
    verbose: bool = False,
):
    """Coordinate-descent reassignment for one batch of rows.

    For sliding groups of g_cd vector slots, enumerates all n_cluster**g_cd
    candidate assignments, scores each with the local quadratic + linear
    objective (parallel_objective_function_sub), and keeps the argmin.
    Sweeps the d slots cd_cycles times and returns a fresh one-hot P.
    """
    device = torch.device("cuda")
    C = C.to(device)
    C_ = C.unsqueeze(0).expand(P.shape[0], -1, -1)
    assignments_prev = P.argmax(dim=-1).to(device)  # Shape: (b, group_size // vec_sz)
    b, d = assignments_prev.shape
    n_cluster, vec_sz = C_.size(1), C_.size(2)
    assert H.shape[0] == 1  # only a single Hessian block is supported here
    H_ = H[0]

    assignments = assignments_prev.clone()
    update_size = cd_cycles * d  # total slot-updates across all sweeps

    for update_start_idx in range(0, update_size, g_cd):
        start_idx = update_start_idx % d  # wrap around for repeated sweeps
        end_idx = min(start_idx + g_cd, d)
        indices = torch.arange(start_idx * vec_sz, end_idx * vec_sz, device=device)
        indices_assignments = torch.arange(start_idx, end_idx, device=device)
        # Generate all possible assignments for the group
        num_options = n_cluster ** g_cd
        if num_options > 1e6:
            print(f"Skipping group starting at index {start_idx} due to large number of assignments ({num_options}).")
            continue

        # Create all possible assignments for the group
        from itertools import product
        assignments_list = list(product(range(n_cluster), repeat=g_cd))
        assignments_array = torch.tensor(assignments_list, device=device).T  # Shape: (g_cd, num_options)
        assignments_array = assignments_array.unsqueeze(0).expand(b, -1, -1)  # Shape: (b, g_cd, num_options, vec_sz)

        # Creating options for g_cd weights
        C_expanded = C_.unsqueeze(1).expand(-1, g_cd, -1, -1)  # Shape: (b, g_cd, n_cluster, vec_sz)
        W_g_hat_options = torch.gather(C_expanded, dim=2, index=assignments_array.unsqueeze(-1).expand(-1, -1, -1, vec_sz))  # Shape: (b, g_cd, num_options, vec_sz)

        # Gathering original quantized weights and compute linear & quadratic terms
        # Expand C and gather original weights
        C_expanded_org = C_.unsqueeze(1).expand(-1, d, -1, -1)  # Shape: (b, d, n_cluster, vec_sz)
        W_hat_org = torch.gather(C_expanded_org, dim=2, index=assignments.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, 1, vec_sz)).squeeze(2)  # Shape: (b, d, vec_sz)

        # Compute deltas
        delta_w_org = W_hat_org.view(b, -1) - W  # Shape: (b, group_size)

        # Get indices and slices: everything outside the group being updated.
        notg_indices = torch.cat([
            torch.arange(0, start_idx * vec_sz, device=device),
            torch.arange(end_idx * vec_sz, d * vec_sz, device=device)
        ])

        H_g_notg = H_[indices, :][:, notg_indices]  # Shape: (g_cd * vec_sz, group_size - g_cd * vec_sz)

        delta_w_org_notg = delta_w_org[:, notg_indices].to(device)  # Shape: (b, group_size - g_cd * vec_sz)

        # Compute quadratic and linear terms
        quadratic = H_[indices, :][:, indices]  # Shape: (g_cd * vec_sz, g_cd * vec_sz)
        linear = 2 * torch.einsum('gd,id->ig', H_g_notg, delta_w_org_notg)  # Shape: (b, g_cd * vec_sz)
        W_g = W[:, indices]  # Shape: (b, g_cd * vec_sz)

        W_g_hat_options = W_g_hat_options.permute(0, 1, 3, 2).view(b, g_cd * vec_sz, num_options)  # Shape: (b, g_cd * vec_sz, num_options)
        # Objective function computation
        cur_obj_value = parallel_objective_function_sub(W_g, quadratic, linear, W_g_hat_options)  # Shape: (b, num_options)

        # Update assignments with the best-scoring option per row.
        min_obj, argmin_obj = cur_obj_value.min(dim=1, keepdim=True)
        expanded_argmin_obj = argmin_obj.unsqueeze(1).expand(-1, g_cd, -1).to(device)
        assignments[:, indices_assignments] = assignments_array.gather(dim=2, index=expanded_argmin_obj).squeeze(-1)  # Shape: (row_count * group_count, g_cd)

    num_changed = (assignments_prev != assignments).sum().item()
    total_assignments = assignments_prev.numel()
    percentage_changed = num_changed / total_assignments * 100
    if verbose:
        logging.info(f"Percentage of assignments changed: {percentage_changed:.2f}%%")

    # Convert assignments to one-hot encoding to create new P
    P = torch.zeros((b, d, n_cluster), dtype=torch.float32, device=assignments.device)
    P.scatter_(2, assignments.long().unsqueeze(-1), 1.0)

    return P
170
+
171
+
172
def update_P(
    W: torch.Tensor,  # Shape: (row_count * group_count, group_size)
    H: torch.Tensor,  # Shape: (blk_num, group_size, group_size)
    P: torch.Tensor,  # Shape: (row_count * group_count, group_size//vec_sz, n_cluster)
    C: torch.Tensor,  # Shape: (n_cluster, vec_sz)
    iteration: int,
    g_cd: int = 1,
    cd_cycles: int = 4,
):
    """Update the assignment matrix P for all rows via batched coordinate descent.

    Rows are processed in fixed-size chunks (moved to GPU per chunk to bound
    memory); each chunk is delegated to update_batch_P and the updated
    one-hot assignments are concatenated back on CPU.
    """
    n_cluster = C.shape[0]
    batch_output_size = 4096  # * 32 // max(32, n_cluster)
    device = torch.device("cuda")
    updated_P_list = []

    pb = get_progress_bar((W.size(0) - 1) // batch_output_size + 1, f"Updating P (cd_cycles={cd_cycles})")
    for out_idx in range(0, W.size(0), batch_output_size):
        torch.cuda.reset_peak_memory_stats()  # Reset memory stats at start of iteration

        W_batch = W[out_idx:out_idx+batch_output_size].to(device)
        P_batch = P[out_idx:out_idx+batch_output_size].to(device)
        C_batch = C.to(device)

        verbose = False  # (out_idx == 0)

        updated_P_batch = update_batch_P(W_batch, H, P_batch, C_batch, iteration, g_cd=g_cd, cd_cycles=cd_cycles, verbose=verbose).cpu()
        updated_P_list.append(updated_P_batch)
        pb.update(1)
    pb.close()

    # Log max CUDA memory usage
    P = torch.cat(updated_P_list, dim=0)
    return P
204
+
205
def project_to_pd(H, eps=1e-2):
    """Project a (near-)symmetric matrix onto the symmetric positive-definite cone.

    Symmetrizes H, clamps its eigenvalues from below at eps, rebuilds the
    matrix from the eigendecomposition, and re-symmetrizes once more to wash
    out floating-point round-off. The result is cast back to H's dtype.
    """
    sym = (H + H.T) / 2
    evals, evecs = torch.linalg.eigh(sym)
    clamped = evals.clamp(min=eps)
    rebuilt = evecs @ torch.diag(clamped) @ evecs.T
    rebuilt = (rebuilt + rebuilt.T) / 2
    return rebuilt.to(H.dtype)
213
+
214
+ import torch
215
def kron_with_identity_vec(P: torch.Tensor, vec_sz: int) -> torch.Tensor:
    """Expand each scalar of P with a vec_sz x vec_sz identity block.

    Maps P of shape (B, d, c) to shape (B, d*vec_sz, c*vec_sz), where every
    scalar P[b, i, j] becomes the block P[b, i, j] * I_{vec_sz} — i.e. a
    batched Kronecker product with the identity.
    """
    batch, rows, cols = P.shape
    identity = torch.eye(vec_sz, device=P.device)
    # Outer product per entry, laid out so rows/cols interleave with the
    # identity axes: (B, rows, vec_sz, cols, vec_sz).
    blocks = torch.einsum('brc,uv->brucv', P, identity)
    return blocks.reshape(batch, rows * vec_sz, cols * vec_sz)
225
+
226
def update_C(
    W: torch.Tensor,  # (row, gs)
    H: torch.Tensor,  # (1, gs, gs)
    P: torch.Tensor,  # (row, gs//vec_sz, n_cluster)
    C: torch.Tensor,  # (n_cluster, vec_sz)
    batch_size: int = 256
):
    """Solve the H-weighted least squares for the centroids with P held fixed.

    Whitens each row batch by the transposed Cholesky factor of H[0],
    accumulates the normal equations A c = b over all batches, and returns
    the solved centroids reshaped to (n_cluster, vec_sz).
    """
    device = W.device
    dtype = W.dtype

    chol = torch.linalg.cholesky(H[0])  # (gs, gs) lower-triangular factor
    cholT = chol.transpose(0, 1)        # whitening transform

    row, gs = W.shape
    n_cluster, vec_sz = C.shape
    dim = n_cluster * vec_sz

    normal_A = torch.zeros(dim, dim, device=device, dtype=dtype)
    normal_b = torch.zeros(dim, device=device, dtype=dtype)

    for lo in range(0, row, batch_size):
        hi = min(lo + batch_size, row)
        P_blk = P[lo:hi].to(device)  # (B, gs//vec_sz, n_cluster)
        W_blk = W[lo:hi].to(device)  # (B, gs)

        # Kronecker-expand assignments with I_{vec_sz}: (B, gs, dim).
        design = kron_with_identity_vec(P_blk, vec_sz)

        whitened_X = torch.einsum('ij,bjk->bik', cholT, design)  # (B, gs, dim)
        whitened_y = torch.einsum('ij,bj->bi', cholT, W_blk)     # (B, gs)

        normal_A += torch.einsum('bik,bil->kl', whitened_X, whitened_X)
        normal_b += torch.einsum('bik,bi->k', whitened_X, whitened_y)

    return torch.linalg.solve(normal_A, normal_b).view(n_cluster, vec_sz)
266
+
267
def train_least_squares(
    W: np.ndarray,  # Shape: (row_count * group_count, group_size)
    init_P: np.ndarray,  # Shape: (row_count * group_count, group_size//vec_sz, n_cluster)
    init_centroids: np.ndarray,  # Shape: (n_cluster, vec_sz)
    H: np.ndarray,  # Shape: (blk_num, group_size, group_size)
    num_iterations: int = 3,
    cd_cycles: int = 4,
    eig_threshold: float = 1e-3,
) -> Tuple[np.ndarray, np.ndarray]:
    """Alternating least squares over assignments P and centroids C.

    First dampens each Hessian block's diagonal until Cholesky succeeds
    (the process exits if the damping factor exceeds 1.0). Then alternates
    update_P (coordinate descent) and update_C (closed-form solve), tracking
    the best (P, C) by the H-weighted objective and early-stopping as soon
    as a C update fails to improve.

    Returns:
        (P, C, log_dict) on CPU; log_dict records the objective after every
        P and C update, keyed by iteration.
    """
    device = torch.device("cuda")

    P = torch.tensor(init_P, dtype=torch.float32, device="cpu")
    C = torch.tensor(init_centroids, dtype=torch.float32, device="cpu")
    W = torch.tensor(W, dtype=torch.float32).to(device)
    H = torch.tensor(H, dtype=torch.float32).to(device)

    # eigenvalues = torch.linalg.eigvalsh(H)
    # for i in range(eigenvalues.shape[0]):
    # top_3_and_bottom_3 = [round(eig.item(), 2) for eig in torch.cat([eigenvalues[i][:3], eigenvalues[i][-3:]])]
    # logging.info(f"{i+1}-th H has Eigenvalues (top 3 and bottom 3): {top_3_and_bottom_3}, Projecting to PD with eps=1e-6 for numerical stability")
    # H[i] = project_to_pd(H[i], eps=1e-6)

    # eps = eig_threshold * 10
    # while not torch.all(eigenvalues[i] > eig_threshold):
    # top_3_and_bottom_3 = [round(eig.item(), 2) for eig in torch.cat([eigenvalues[i][:3], eigenvalues[i][-3:]])]
    # logging.info(f"{i+1}-th H not PD, Eigenvalues (top 3 and bottom 3): {top_3_and_bottom_3}, Projecting to PD with eps={eps}")
    # H[i] = project_to_pd(H[i], eps=eps)
    # eigenvalues = torch.linalg.eigvalsh(H)
    # eps *= 10
    # top_3_and_bottom_3 = [round(eig.item(), 2) for eig in torch.cat([eigenvalues[i][:3], eigenvalues[i][-3:]])]
    # logging.info(f"{i+1}-th H PD, Eigenvalues (top 3 and bottom 3): {top_3_and_bottom_3}")
    diag = torch.arange(H.shape[1], device=device)
    for i in range(H.shape[0]):
        # Increase diagonal damping (x10 each try) until Cholesky succeeds.
        avg_diag = torch.mean(torch.diag(H[i]))
        damp, prev_damp = 1e-7, 0.
        while True:
            try:
                torch.linalg.cholesky(H[i])
                logging.info(f"{i+1}-th H is PD, dampening factor={prev_damp:.2e}")
                break
            except Exception as e:
                print(e)
                logging.info(f"{i+1}-th H is not PD, try dampening with factor={damp:.2e}")
                H[i, diag, diag] += (damp - prev_damp) * avg_diag
                prev_damp = damp
                damp *= 10
                if damp > 1e0:
                    exit()

    best_obj_value = objective_function(W, H, P, C).item()
    best_P, best_C = P.detach().cpu().clone(), C.detach().cpu().clone()
    logging.info(f"Initial objective: {best_obj_value:.6f}")

    log_dict = {"objective": [], "iteration": []}
    log_dict["objective"].append(best_obj_value)
    log_dict["iteration"].append(0)

    for iteration in range(num_iterations):
        start_time = time.time()

        ######### Update P #########
        # Skip the P update on the very first iteration (use the k-means init).
        if iteration > 0:
            P = update_P(W, H, P, C, iteration, cd_cycles=cd_cycles)

        # Compute objective value for logging
        obj_value = objective_function(W, H, P, C).item()
        logging.info(f"Iteration {iteration + 1} (P update): Objective: {obj_value:.4f}")
        log_dict["objective"].append(obj_value)
        log_dict["iteration"].append(iteration + 1)


        ######### Update C #########
        C = update_C(W, H, P, C)

        # Check if the objective value improved
        current_obj_value = objective_function(W, H, P, C).item()
        log_dict["objective"].append(current_obj_value)
        log_dict["iteration"].append(iteration + 1)
        if current_obj_value < best_obj_value:
            best_obj_value = current_obj_value
            best_P, best_C = P.detach().cpu().clone(), C.detach().cpu().clone()
            logging.info(f"Iteration {iteration + 1} (C update): Objective: {current_obj_value:.4f} | Improved and using this one.")
        else:
            logging.info(f"Iteration {iteration + 1} (C update): Objective: {current_obj_value:.4f} | Not improved. Using previous best values.")
            P, C = best_P, best_C
            break  # Early stopping

        end_time = time.time()

        logging.info(f"Iteration {iteration + 1} / {num_iterations} completed. "
                     f"Update time: {end_time - start_time:.2f} sec")

    end_time = time.time()
    # NOTE(review): start_time is reset every iteration, so this logs the last
    # iteration's duration, not the full training time.
    logging.info(f"Least squares training time: {end_time - start_time:.2f} seconds")

    P = P.detach().cpu()
    C = C.detach().cpu().to(torch.float32)

    return P, C, log_dict
366
+
367
+
368
def test():
    """Smoke-test train_least_squares on random data with a k-means init."""
    from lib.utils.kmeans import fit_kmeans
    # set seed
    torch.manual_seed(0)
    np.random.seed(0)
    torch.cuda.manual_seed(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    vec_sz = 4
    lut_size = (1 << (2 * vec_sz))  # 2 bits per weight -> 256 centroids
    W = torch.randn(4096, 4096)
    H = torch.randn(4096, 4096)

    rand_data = torch.randn(10000, vec_sz)
    C = fit_kmeans(rand_data, lut_size)[0]
    print("C", C.shape)

    # Nearest-centroid initial assignment (shapes below are for vec_sz=4, lut_size=256).
    W_vec = W.view(4096, 4096 // vec_sz, vec_sz)
    W_vec = W_vec.unsqueeze(0)  # Shape: (1, 4096, 1024, 4)
    C_ = C.unsqueeze(1).unsqueeze(1)  # Shape: (256, 1, 1, 4)
    diff = W_vec - C_  # Shape: (256, 4096, 1024, 4)
    dist_sq = diff.pow(2).sum(-1)  # Shape: (256, 4096, 1024)
    idx = dist_sq.argmin(dim=0)  # Shape: (4096, 1024)
    init_P = torch.zeros(4096, 4096 // vec_sz, lut_size)
    init_P.scatter_(2, idx.unsqueeze(-1), 1)

    # Make H symmetric positive definite.
    H = H @ H.T
    H = H + 1e-6 * torch.eye(4096, 4096)
    H = H.unsqueeze(0)

    P, C, log_dict = train_least_squares(
        W=W.numpy(),
        init_P=init_P.numpy(),
        init_centroids=C.numpy(),
        H=H.numpy(),
        num_iterations=10,
        cd_cycles=4,
        eig_threshold=1e-3,
    )

    for i in range(len(log_dict["objective"])):
        logging.info(f"Iteration {log_dict['iteration'][i]}: Objective: {log_dict['objective'][i]:.4f}")
    print("P", P.shape)
    print("C", C.shape)

    # recons: plain MSE of the reconstruction
    W_hat = torch.einsum('ijc,ck->ijk', P, C)
    W_hat = W_hat.view(W_hat.shape[0], -1)
    err = (W - W_hat).pow(2).mean()
    print("err", err.item())

    # Hessian-weighted error (trace form), normalized by the feature dim.
    dWHdW = (W - W_hat) @ H[0] @ (W - W_hat).T
    err_tr = torch.trace(dWHdW) / H.shape[1]
    print("err_tr", err_tr.item())
424
if __name__ == "__main__":
    # Run the random-data smoke test with INFO-level logging.
    logging.basicConfig(
        level=logging.INFO,
        format='[%(levelname)s] %(asctime)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    test()
431
+
lib/quantizer/pack_op.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numba
2
+ import numpy as np
3
@numba.njit(cache=True)
def general_pack(unpacked, nbits, codeT, code_n):
    '''
    Sequentially pack nbits-wide codes into an array of codeT words, LSB-first.
    args:
        unpacked: np.int (n_unpacked,) each entry is 0 .. 2 ** nbits - 1
        nbits: int, bit width of each code
        codeT: NumPy unsigned scalar type of the output words (e.g. np.uint32)
        code_n: int, output length; must equal n_unpacked * nbits / word-bits

    return: out_code (code_n,) dtype: codeT
    '''
    n_unpacked = unpacked.shape[0]
    codeT_sz = codeT.itemsize * 8  # bits per output word
    assert n_unpacked * nbits / codeT_sz == code_n, "code_n must be equal to n_unpacked * nbits / codeT_sz"
    out_code = np.zeros(code_n, dtype=codeT)
    for i in range(n_unpacked):
        val = codeT(unpacked[i])  # cast so shifts happen at output word width
        offset = i * nbits  # absolute bit offset of this code in the stream
        wIndex = offset // codeT_sz  # destination word
        bIndex = offset % codeT_sz   # bit position inside that word
        out_code[wIndex] |= (val << bIndex) & np.iinfo(codeT).max

        # Spill the high bits into the next word when the code straddles a boundary.
        bits_in_word = codeT_sz - bIndex
        if bits_in_word < nbits:
            upper = val >> bits_in_word
            out_code[wIndex + 1] |= upper & np.iinfo(codeT).max

    return out_code
31
+
32
@numba.njit(cache=True)
def general_pack_8(unpacked, nbits, code_n):
    '''
    Sequentially pack nbits-wide codes into a uint8 array, LSB-first.
    args:
        unpacked: np.int (n_unpacked,) each entry is 0 .. 2 ** nbits - 1
        nbits: int, bit width of each code
        code_n: int, output length; must equal n_unpacked * nbits / 8

    return: out_code (code_n,) dtype: np.uint8
    '''
    n_unpacked = unpacked.shape[0]
    assert n_unpacked * nbits / 8 == code_n, "code_n must be equal to n_unpacked * nbits / 8"
    out_code = np.zeros(code_n, dtype=np.uint8)
    for i in range(n_unpacked):
        val = unpacked[i]
        offset = i * nbits  # absolute bit offset of this code
        wIndex = offset // 8  # destination byte
        bIndex = offset % 8   # bit position inside that byte
        out_code[wIndex] |= (val << bIndex) & np.iinfo(np.uint8).max

        # Spill the high bits into the next byte when the code straddles a boundary.
        bits_in_word = 8 - bIndex
        if bits_in_word < nbits:
            upper = val >> bits_in_word
            out_code[wIndex + 1] |= upper & np.iinfo(np.uint8).max
    return out_code
58
+
59
@numba.njit(cache=True)
def general_pack_16(unpacked, nbits, code_n):
    '''
    Sequentially pack nbits-wide codes into a uint16 array, LSB-first.
    args:
        unpacked: np.int (n_unpacked,) each entry is 0 .. 2 ** nbits - 1
        nbits: int, bit width of each code
        code_n: int, output length; must equal n_unpacked * nbits / 16

    return: out_code (code_n,) dtype: np.uint16
    '''
    n_unpacked = unpacked.shape[0]
    assert n_unpacked * nbits / 16 == code_n, "code_n must be equal to n_unpacked * nbits / 16"
    out_code = np.zeros(code_n, dtype=np.uint16)
    for i in range(n_unpacked):
        val = unpacked[i]
        offset = i * nbits  # absolute bit offset of this code
        wIndex = offset // 16  # destination word
        bIndex = offset % 16   # bit position inside that word
        out_code[wIndex] |= (val << bIndex) & np.iinfo(np.uint16).max

        # Spill the high bits into the next word when the code straddles a boundary.
        bits_in_word = 16 - bIndex
        if bits_in_word < nbits:
            upper = val >> bits_in_word
            out_code[wIndex + 1] |= upper & np.iinfo(np.uint16).max

    return out_code
86
+
87
+
88
@numba.njit(cache=True)
def general_pack_32(unpacked, nbits, code_n):
    '''
    Sequentially pack nbits-wide codes into a uint32 array, LSB-first.
    args:
        unpacked: np.int (n_unpacked,) each entry is 0 .. 2 ** nbits - 1
        nbits: int, bit width of each code
        code_n: int, output length; must equal n_unpacked * nbits / 32

    return: out_code (code_n,) dtype: np.uint32
    '''
    n_unpacked = unpacked.shape[0]
    assert n_unpacked * nbits / 32 == code_n, "code_n must be equal to n_unpacked * nbits / 32"
    out_code = np.zeros(code_n, dtype=np.uint32)
    for i in range(n_unpacked):
        val = unpacked[i]
        offset = i * nbits  # absolute bit offset of this code
        wIndex = offset // 32  # destination word
        bIndex = offset % 32   # bit position inside that word
        out_code[wIndex] |= (val << bIndex) & np.iinfo(np.uint32).max

        # Spill the high bits into the next word when the code straddles a boundary.
        bits_in_word = 32 - bIndex
        if bits_in_word < nbits:
            upper = val >> bits_in_word
            out_code[wIndex + 1] |= upper & np.iinfo(np.uint32).max

    return out_code
115
+
116
@numba.njit(cache=True)
def general_pack_64(unpacked, nbits, code_n):
    '''
    Sequentially pack codes into a uint64 array (LSB-first bit stream).

    args:
        unpacked: np.int (n_unpacked,) each entry is 0 .. 2 ** nbits - 1
        nbits: int  bits stored per entry
        code_n: int  number of uint64 words in the output

    return: out_code (code_n,) dtype: np.uint64
    '''
    n_unpacked = unpacked.shape[0]
    assert n_unpacked * nbits / 64 == code_n, "code_n must be equal to n_unpacked * nbits / 64"
    out_code = np.zeros(code_n, dtype=np.uint64)
    for i in range(n_unpacked):
        val = unpacked[i]
        offset = i * nbits      # absolute bit position of this entry
        wIndex = offset // 64   # word holding the low bits
        bIndex = offset % 64    # bit offset inside that word
        out_code[wIndex] |= (val << bIndex) & np.iinfo(np.uint64).max

        bits_in_word = 64 - bIndex
        if bits_in_word < nbits:
            # entry straddles a word boundary: carry the high bits into the next word
            upper = val >> bits_in_word
            out_code[wIndex + 1] |= upper & np.iinfo(np.uint64).max

    return out_code
143
+
144
+ @numba.njit(cache=True)
145
+ def pack_codes_8(codes, nbits, code_n):
146
+ '''
147
+ sequentially packing codes into codeT type array.
148
+ args:
149
+ codes: np.int (n_samples,) each entry is 0 .. 2 ** nbits - 1
150
+ nbits: int
151
+ codeT: np.dtype (uint64 or uint32 or uint16 or uint8)
152
+ code_n: int
153
+
154
+ return:
155
+ packed_codes (-1, code_n) dtype: codeT
156
+ '''
157
+ n_samples = codes.shape[0]
158
+ n_unpacked = code_n * 8 // nbits
159
+ packed_codes = np.zeros((n_samples // n_unpacked, code_n), dtype=np.uint8)
160
+ for i in range(n_samples // n_unpacked):
161
+ unpacked = codes[i * n_unpacked: (i + 1) * n_unpacked]
162
+ packed_codes[i] = general_pack_8(unpacked, nbits, code_n)
163
+ return packed_codes
164
+
165
+ @numba.njit(cache=True)
166
+ def pack_codes_16(codes, nbits, code_n):
167
+ '''
168
+ sequentially packing codes into codeT type array.
169
+ args:
170
+ codes: np.int (n_samples,) each entry is 0 .. 2 ** nbits - 1
171
+ nbits: int
172
+ codeT: np.dtype (uint64 or uint32 or uint16 or uint8)
173
+ code_n: int
174
+
175
+ return:
176
+ packed_codes (-1, code_n) dtype: codeT
177
+ '''
178
+ n_samples = codes.shape[0]
179
+ n_unpacked = code_n * 16 // nbits
180
+ packed_codes = np.zeros((n_samples // n_unpacked, code_n), dtype=np.uint16)
181
+ for i in range(n_samples // n_unpacked):
182
+ unpacked = codes[i * n_unpacked: (i + 1) * n_unpacked]
183
+ packed_codes[i] = general_pack_16(unpacked, nbits, code_n)
184
+ return packed_codes
185
+
186
+ @numba.njit(cache=True)
187
+ def pack_codes_32(codes, nbits, code_n):
188
+ '''
189
+ sequentially packing codes into codeT type array.
190
+ args:
191
+ codes: np.int (n_samples,) each entry is 0 .. 2 ** nbits - 1
192
+ nbits: int
193
+ codeT: np.dtype (uint64 or uint32 or uint16 or uint8)
194
+ code_n: int
195
+
196
+ return:
197
+ packed_codes (-1, code_n) dtype: codeT
198
+ '''
199
+ n_samples = codes.shape[0]
200
+ n_unpacked = code_n * 32 // nbits
201
+ packed_codes = np.zeros((n_samples // n_unpacked, code_n), dtype=np.uint32)
202
+ for i in range(n_samples // n_unpacked):
203
+ unpacked = codes[i * n_unpacked: (i + 1) * n_unpacked]
204
+ packed_codes[i] = general_pack_32(unpacked, nbits, code_n)
205
+ return packed_codes
206
+
207
+ @numba.njit(cache=True)
208
+ def pack_codes_64(codes, nbits, code_n):
209
+ '''
210
+ sequentially packing codes into codeT type array.
211
+ args:
212
+ codes: np.int (n_samples,) each entry is 0 .. 2 ** nbits - 1
213
+ nbits: int
214
+ codeT: np.dtype (uint64 or uint32 or uint16 or uint8)
215
+ code_n: int
216
+
217
+ return:
218
+ packed_codes (-1, code_n) dtype: codeT
219
+ '''
220
+ n_samples = codes.shape[0]
221
+ n_unpacked = code_n * 64 // nbits
222
+ packed_codes = np.zeros((int(n_samples // n_unpacked), code_n), dtype=np.uint64)
223
+ for i in range(n_samples // n_unpacked):
224
+ unpacked = codes[i * n_unpacked: (i + 1) * n_unpacked]
225
+ packed_codes[i] = general_pack_64(unpacked, nbits, code_n)
226
+ return packed_codes
227
+
228
+ def pack_codes(codes, nbits, code_n, codeT_sz):
229
+ if codeT_sz == 8:
230
+ return pack_codes_8(codes, nbits, code_n)
231
+ elif codeT_sz == 16:
232
+ return pack_codes_16(codes, nbits, code_n)
233
+ elif codeT_sz == 32:
234
+ return pack_codes_32(codes, nbits, code_n)
235
+ elif codeT_sz == 64:
236
+ return pack_codes_64(codes, nbits, code_n)
237
+ else:
238
+ raise ValueError(f"Unsupported codeT_sz: {codeT_sz}")
239
+
240
+
241
+
242
+ @numba.njit(cache=True)
243
+ def pack_32(cluster_idx: np.ndarray, nbits: int) -> np.ndarray:
244
+ """
245
+ NumPy 버전의 pack_32 함수.
246
+
247
+ Parameters
248
+ ----------
249
+ cluster_idx : np.ndarray of shape (32,), dtype=int
250
+ 길이 32의 정수 배열. (C 코드에서 const int* cluster_idx와 동일 역할)
251
+ nbits : int
252
+ 각 정수를 몇 비트로 저장할지.
253
+
254
+ Returns
255
+ -------
256
+ out_code : np.ndarray of shape (out_size,), dtype=np.uint32
257
+ 32개의 값(각각 nbits 비트)으로 구성된 연속 비트열을
258
+ 32비트 워드(uint32) 단위로 나눈 결과.
259
+ """
260
+
261
+ # 32개의 값을 nbits비트씩 사용하면 총 32*nbits 비트가 필요.
262
+ # 이를 32비트 단위로 나누면 아래처럼 워드 수가 결정됨.
263
+ out_size = (32 * nbits + 31) // 32 # 올림
264
+
265
+ # 결과 버퍼 (np.uint32로)
266
+ out_code = np.zeros(out_size, dtype=np.uint32)
267
+
268
+ for i in range(32):
269
+ # cluster_idx[i]를 unsigned 처리
270
+ val = np.uint32(cluster_idx[i])
271
+
272
+ offset = i * nbits
273
+ wIndex = offset // 32 # 몇 번째 워드인지
274
+ bIndex = offset % 32 # 그 워드 내에서 몇 번째 비트부터 시작?
275
+
276
+ # 첫 번째 워드에 bIndex부터 nbits비트 중 일부 혹은 전부를 저장
277
+ out_code[wIndex] |= (val << bIndex) & np.uint32(0xFFFFFFFF)
278
+
279
+ # 현재 워드에 다 못 들어가는 나머지 비트가 있으면, 다음 워드에 저장
280
+ bits_in_word = 32 - bIndex
281
+ if bits_in_word < nbits:
282
+ upper = val >> bits_in_word
283
+ out_code[wIndex + 1] |= upper & np.uint32(0xFFFFFFFF)
284
+
285
+ return out_code
286
+
287
+ @numba.njit(cache=True)
288
+ def pack_for_sq_pack_kernel(unpacked_code: np.ndarray,
289
+ nbits: int,
290
+ blockDimX: int=32) -> np.ndarray:
291
+ """
292
+ Python 버전의 pack_for_sq_pack_kernel.
293
+ unpacked_code: shape = (N, K), dtype=uint32
294
+ nbits: int
295
+ blockDimX: int
296
+ return: Bcode (1D np.uint32 배열, 길이 = N * (K*nbits//32))
297
+ """
298
+
299
+ N, K = unpacked_code.shape
300
+ out_size = N * (K * nbits // 32)
301
+ Bcode = np.zeros(out_size, dtype=np.uint32)
302
+
303
+ K_iter = int(np.ceil(K / (32 * blockDimX)))
304
+
305
+ # 임시 버퍼
306
+ unpacked_Bcode_row = np.zeros(32, dtype=np.uint32)
307
+
308
+ for n_ in range(N):
309
+ eff_warp_size = blockDimX
310
+ for k_ in range(K_iter):
311
+ for thx in range(blockDimX):
312
+ if k_ == K // (32 * blockDimX):
313
+ eff_warp_size = (K % (32 * blockDimX)) // 32
314
+ if thx >= eff_warp_size:
315
+ break
316
+ k_val = k_ * 32 * blockDimX + 8 * thx
317
+ k_code = k_ * nbits * blockDimX + thx
318
+
319
+ # unpacked_Bcode_row에 32개 로드
320
+ idx_out = 0
321
+ for j in range(4):
322
+ k_val_idx = k_val + 8 * j * eff_warp_size
323
+ for i in range(8):
324
+ unpacked_Bcode_row[idx_out] = unpacked_code[n_, k_val_idx + i]
325
+ idx_out += 1
326
+
327
+ # pack_32 호출
328
+ Bcode_row = pack_32(unpacked_Bcode_row, nbits)
329
+
330
+ # Bcode에 저장
331
+ for j in range(nbits):
332
+ k_code_idx = k_code + j * eff_warp_size
333
+ Bcode[n_ * (K * nbits // 32) + k_code_idx] = Bcode_row[j]
334
+
335
+ return Bcode
lib/quantizer/quant_op.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import lib.utils as utils
from lib.quantizer.pack_op import pack_codes, pack_for_sq_pack_kernel
import torch
# Byte permutation applied inside each 256-entry (16x16) tile before packing.
# NOTE(review): presumably this matches the tensor-core fragment ordering used
# by the CUDA kernels — confirm against the kernel source.
_PERMUTE = torch.arange(256).reshape(2, 8, 2, 4, 2).permute(1, 3, 2, 0,
                                                            4).flatten()
# Inverse permutation: _INV_PERMUTE[_PERMUTE] == arange(256).
_INV_PERMUTE = torch.zeros(256, dtype=torch.int64)
_INV_PERMUTE[_PERMUTE] = torch.arange(256)
8
+
9
def random_mat(N, K):
    '''Return a random (N, K) fp16 matrix on the GPU.'''
    mat = torch.randn(N, K, dtype=torch.float16)
    return mat.cuda()
11
+
12
def random_lut(nbits, vec_sz):
    '''Return a random fp16 LUT with 2**nbits rows of vec_sz entries, on the GPU.'''
    lut = torch.randn(2 ** nbits, vec_sz, dtype=torch.float16)
    return lut.cuda()
14
+
15
def vq_pack_reshape_pack_routine(packed_codes, warp_size, code_n, N):
    '''
    Interleave packed code rows for warp-strided access.

    packed_codes: torch.Tensor, reshaped internally to (N, -1, code_n)
    warp_size: int
    code_n: int  packed words per row
    N: int  number of output rows

    return: (N, -1) tensor where, within every group of warp_size rows,
    word j of all rows becomes contiguous (rows/words transposed).
    '''
    def _interleave(block, lanes):
        # (N, groups * lanes, code_n) -> transpose lanes/words inside each group
        return block.reshape(N, -1, lanes, code_n).permute(0, 1, 3, 2).reshape(N, -1)

    packed_codes = packed_codes.reshape(N, -1, code_n)
    n_rows = packed_codes.shape[1]
    remainder = n_rows % warp_size
    if remainder == 0:
        return _interleave(packed_codes, warp_size)
    # trailing rows form a single partial warp; interleave it at its own width
    split = n_rows - remainder
    full = _interleave(packed_codes[:, :split], warp_size)
    tail = _interleave(packed_codes[:, split:], remainder)
    return torch.cat([full, tail], dim=1)
32
+
33
def reshape_mat(mat, chunk_size, warp_size=32):
    '''
    Reorder an (N, K) matrix into warp-major chunks.

    mat: torch.Tensor, shape = (N, K)
    chunk_size: int  must be a multiple of 8
    warp_size: int

    return: (N, K // (chunk_size * warp_size), warp_size, chunk_size) tensor
    where consecutive 8-element groups are distributed round-robin across
    the warp lanes.
    '''
    N, K = mat.shape
    assert K % 8 == 0 and chunk_size % 8 == 0, "K and chunk_size must be divisible by 8"
    assert K % (chunk_size * warp_size) == 0, "K must be divisible by chunk_size * warp_size"
    groups = chunk_size // 8
    iters = K // (chunk_size * warp_size)
    lanes_major = mat.reshape(N, iters, groups, warp_size, 8).permute(0, 1, 3, 2, 4)
    return lanes_major.reshape(N, iters, warp_size, chunk_size)
46
+
47
def vq_pack_reshape_mat_routine(mat, chunk_size, vec_sz, warp_size=32):
    '''
    Split an (N, K) matrix into warp-reordered vec_sz-dim vectors.

    mat: torch.Tensor, shape = (N, K)
    chunk_size: int
    vec_sz: int  vector length of each output row
    warp_size: int

    return: vecs (-1, vec_sz)
    '''
    N, K = mat.shape
    assert K % chunk_size == 0, "K must be divisible by chunk_size"
    if K % (chunk_size * warp_size) == 0:
        mat = reshape_mat(mat, chunk_size, warp_size)
        vecs = mat.reshape(-1, vec_sz)
    else:
        # trailing columns form a partial warp; reorder them at their own width
        full_warp_part = K - K % (chunk_size * warp_size)
        mat_full = reshape_mat(mat[:, :full_warp_part], chunk_size, warp_size)
        effective_warp_size = (K % (chunk_size * warp_size)) // chunk_size
        mat_partial = reshape_mat(mat[:, full_warp_part:], chunk_size, effective_warp_size)

        vecs = torch.cat([mat_full.reshape(N, -1, vec_sz), mat_partial.reshape(N, -1, vec_sz)], dim=1).reshape(-1, vec_sz)

    return vecs
68
+
69
def pack_qweight_vq_simt(P, lut_bits, vec_sz, code_n, codeT_sz=32):
    '''
    Pack a VQ index matrix into the SIMT kernel layout.

    P: (N, K // vec_sz) integer codes
    lut_bits: bits per code
    vec_sz: VQ vector size
    code_n: packed words per group
    codeT_sz: packed word width in bits (8/16/32/64)

    return: (N, -1) packed tensor on the GPU
    '''
    N = P.shape[0]
    # duplicate each code vec_sz times so the warp reordering sees one code
    # per scalar column; the duplicates collapse back via [:, 0] below
    expanded_P = P.unsqueeze(-1).expand(-1, -1, vec_sz).reshape(N, -1)
    reshaped_P = vq_pack_reshape_mat_routine(expanded_P, chunk_size=int(32*vec_sz), vec_sz=vec_sz)[:, 0].contiguous()
    packed_codes = torch.from_numpy(pack_codes(reshaped_P.view(-1).cpu().numpy(), lut_bits, code_n, codeT_sz)).reshape(N, -1).cuda()
    packed_codes = vq_pack_reshape_pack_routine(packed_codes, 32, code_n, N)
    return packed_codes
79
+
80
def pack_qweight_sq_simt(P, lut_bits):
    '''
    Pack a scalar-quantized index matrix into the SIMT kernel layout.

    P: (N, K) integer codes
    lut_bits: bits per code

    return: (N, K * lut_bits // 32) uint32 tensor on the GPU
    '''
    N = P.shape[0]
    packed_codes = torch.from_numpy(pack_for_sq_pack_kernel(P.cpu().numpy(), lut_bits)).reshape(N, -1).cuda()
    return packed_codes
87
+
88
# for tensor core
def pack_qweight(P, vec_sz, lut_bits, td_x=16, td_y=16, batch_size=1024):
    '''
    Pack quantized weights for the tensor-core kernel, in row batches.

    P: (N, K // vec_sz, 2 ** lut_bits) one-hot, or (N, K // vec_sz) indices
    vec_sz: int  VQ width (1 or 2)
    lut_bits: int  bits per code
    td_x, td_y: tile dimensions forwarded to pack_qweight_routine
    batch_size: rows packed per call, to bound the intermediate bit tensors

    return: (N, -1) packed tensor (layout from pack_qweight_routine)
    '''
    N = P.shape[0]
    mat_packed = []
    # process in row batches to keep the per-batch one-hot/bit tensors small
    # (removed the unused cur_size local from the original loop)
    for i in range(0, N, batch_size):
        sidx, eidx = i, min(i + batch_size, N)
        mat_packed.append(pack_qweight_routine(P[sidx:eidx], vec_sz, lut_bits, td_x, td_y))
    return torch.cat(mat_packed, dim=0)
100
+
101
+ def pack_qweight_routine(P, vec_sz, lut_bits, td_x=16, td_y=16):
102
+ '''
103
+ P: (N, K // vec_sz, 2 ** lut_bits) 0, 1
104
+ '''
105
+ if vec_sz == 1:
106
+ if len(P.shape) == 3:
107
+ P_ind = P.argmax(dim=-1) # (N, K)
108
+ elif len(P.shape) == 2:
109
+ P_ind = P
110
+ N, K = P_ind.shape
111
+ P_tiled = P_ind.reshape(N // td_x, td_x, K // td_y, td_y) \
112
+ .permute(0, 2, 1, 3) \
113
+ .reshape(-1, td_x * td_y)
114
+ P_tiled_permuted = P_tiled[..., _PERMUTE]
115
+ elif vec_sz == 2:
116
+ if len(P.shape) == 3:
117
+ P_ind = P.argmax(dim=-1) # (N, K // vec_sz)
118
+ else:
119
+ P_ind = P
120
+ P_ind = P_ind.unsqueeze(-1).expand(-1, -1, vec_sz)
121
+ P_ind = P_ind.reshape(P_ind.shape[0], -1).contiguous()
122
+ N, K = P_ind.shape
123
+ P_tiled = P_ind.reshape(N // td_x, td_x, K // td_y, td_y) \
124
+ .permute(0, 2, 1, 3) \
125
+ .reshape(-1, td_x * td_y)
126
+ # permute and flatten
127
+ P_tiled_permuted = P_tiled[..., _PERMUTE].contiguous().view(-1, vec_sz)
128
+ assert torch.allclose(P_tiled_permuted[:, 0], P_tiled_permuted[:, 1]), \
129
+ "P_tiled_permuted[:, 0] and P_tiled_permuted[:, 1] are not the same"
130
+ P_tiled_permuted = P_tiled_permuted[:, 0].contiguous()
131
+ P_tiled_permuted = P_tiled_permuted.reshape((N * K) // (td_x * td_y), td_x * td_y // vec_sz)
132
+
133
+ m = P_tiled_permuted.shape[0]
134
+ c = (td_x * td_y) // vec_sz
135
+
136
+ K_mask = 2 ** torch.arange(lut_bits, device=P.device).view(1, 1, -1) # => [1,1,lut_bits]
137
+ bits_bool = (P_tiled_permuted.unsqueeze(-1) & K_mask) > 0 # => [m, c, lut_bits]
138
+ if vec_sz == 1:
139
+
140
+ # group 4 bytes => 1 uint32
141
+ # group 8 bits => 1 byte
142
+ bits_bool_8 = bits_bool.reshape(m, (c * lut_bits) // 8, 8) # => [m, c*lut_bits/8, 8]
143
+ uint_mask = (2 ** torch.arange(8, device=P.device, dtype=torch.int16)).view(1, 1, 8)
144
+ packed_8 = (bits_bool_8.to(torch.int16) * uint_mask).sum(dim=-1).to(torch.uint8) # => [m, (c*lut_bits)//8]
145
+
146
+ mat_packed = packed_8.reshape(N // td_x // 2, 2, K // td_y // 2, 2, td_x * td_y // 8, lut_bits) \
147
+ .permute(0, 2, 4, 3, 1, 5).contiguous().flatten().view(torch.uint32)\
148
+ .reshape((N * K) // (td_x * td_y), (td_x * td_y * lut_bits) // (32 * vec_sz))
149
+ elif vec_sz == 2:
150
+ # group 8 bits => 1 byte
151
+ bits_bool_4 = bits_bool.reshape(m, (c * lut_bits) // 4, 4) # => [m, c*nbits/8, 8]
152
+ uint_mask = (2 ** torch.arange(4, device=bits_bool_4.device, dtype=torch.int16)).view(1, 1, 4)
153
+ packed_4 = (bits_bool_4.to(torch.int16) * uint_mask).sum(dim=-1).to(torch.uint8) # => [m, (c*nbits)//8]
154
+
155
+ mat_packed_48 = packed_4.reshape(N // td_x // 2, 2, K // td_y // 2, 2, td_x * td_y // 8, lut_bits) \
156
+ .permute(0, 2, 4, 3, 1, 5).contiguous().flatten()
157
+ # uint 4 packed in uint 8 to uint 32
158
+ packing_mask = torch.Tensor([1, 2 ** 4]).to(torch.int8).view(1,2).cuda()
159
+ mat_packed8 = (mat_packed_48.reshape(-1, 2) * packing_mask).sum(dim=-1).to(torch.uint8).contiguous().flatten()
160
+
161
+ mat_packed = mat_packed8.view(torch.uint32).reshape((N * K) // (td_x * td_y), (td_x * td_y * lut_bits) // (32 * vec_sz))
162
+ return mat_packed.view(N, -1)
163
+
164
def load_hessian(in_hess_path, sigma_reg=0.01):
    '''
    Load a flattened Hessian checkpoint, fold in the mean term, regularize.

    in_hess_path: path to a torch checkpoint with keys 'flatH', 'n',
        and optionally 'mu'
    sigma_reg: regularization strength passed to utils.regularize_H

    return: (n, n, 1) float64 tensor
    '''
    # NOTE(review): torch.load unpickles arbitrary objects; only use with
    # trusted checkpoint files.
    H_data = torch.load(in_hess_path, map_location=torch.device('cpu'))
    H = utils.flat_to_sym(H_data['flatH'], H_data['n'])
    if 'mu' in H_data:
        mu = H_data['mu']
        # add back the outer product of the stored mean (presumably restoring
        # the uncentered second moment — confirm against the Hessian collector)
        H += mu[None, :] * mu[:, None]
        del mu
    del H_data
    H = utils.regularize_H(H, sigma_reg)
    assert len(H.shape) == 2 and H.shape[0] == H.shape[1], "H must be a square matrix"
    return H.to(torch.float64).unsqueeze(-1)
175
+
176
def load_group_hessian(in_hess_path, sigma_reg=0.01, layer_key=None):
    '''
    Load a grouped Hessian checkpoint and regularize every slice in place.

    in_hess_path: checkpoint holding an (n, n, groups) tensor under layer_key
    sigma_reg: regularization strength passed to utils.regularize_H
    layer_key: key selecting this layer's Hessian within the checkpoint

    return: (n, n, groups) float64 tensor
    '''
    H_data = torch.load(in_hess_path, map_location=torch.device('cpu'))
    H = H_data[layer_key]
    for i in range(H.shape[-1]):
        H[:, :, i] = utils.regularize_H(H[:, :, i], sigma_reg)
    assert len(H.shape) == 3 and H.shape[0] == H.shape[1], "H must be a square matrix"
    return H.to(torch.float64)
183
+
184
# deprecated func for dequantization
@torch.compile
def dequantize_mat_sq(mat_packed, lut, N, K, nbits, td_x=16, td_y=16):
    '''
    Inverse of the vec_sz=1 tensor-core packing: unpack bit-planes back to
    indices, undo the tile permutation, and look values up in `lut`.

    mat_packed: packed weights from pack_qweight (vec_sz=1)
    lut: lookup table indexed by the nbits codes
    N, K: original matrix shape
    nbits: bits per code
    return: (N, K) reconstructed matrix
    '''
    # undo the (2, 2, ...) tile interleaving applied at pack time
    packed = mat_packed.flatten().view(torch.uint8).reshape(N // td_x // 2,
                                                            K // td_y // 2,
                                                            td_x * td_y // 8,
                                                            2, 2, nbits)
    packed_8 = packed.permute(0, 4, 1, 3, 2, 5).contiguous().reshape(N * K // (td_x * td_y), (td_x * td_y) * nbits // 8)
    # expand every byte into its 8 bits, then regroup the bit-planes per code
    bits_mask = (2 ** torch.arange(8, device=mat_packed.device, dtype=torch.int16)).view(1, 1, 8)
    bits_bool_8 = (packed_8.unsqueeze(-1) & bits_mask) > 0
    bits_bool = bits_bool_8.reshape(N * K // (td_x * td_y), (td_x * td_y), nbits)
    K_mask = 2 ** torch.arange(nbits, device=mat_packed.device).view(1, 1, -1)
    indices = (bits_bool * K_mask).sum(dim=-1)
    recon = lut[indices.long()].reshape(N * K // (td_x * td_y), td_x * td_y)
    # undo the intra-tile permutation, then the tiling itself
    recon = recon.index_select(dim=1, index=_INV_PERMUTE.to(mat_packed.device))
    recon = recon.reshape(N // td_x, K // td_y, td_x, td_y)
    recon = recon.permute(0, 2, 1, 3).reshape(N, K)
    return recon
202
+
203
@torch.compile
def dequantize_mat_sq_inds(mat_packed, N, K, nbits, td_x=16, td_y=16):
    '''
    Like dequantize_mat_sq, but return the raw (N, K) code indices instead
    of reconstructed values (no LUT lookup).
    '''
    # undo the (2, 2, ...) tile interleaving applied at pack time
    packed = mat_packed.flatten().view(torch.uint8).reshape(N // td_x // 2,
                                                            K // td_y // 2,
                                                            td_x * td_y // 8,
                                                            2, 2, nbits)
    packed_8 = packed.permute(0, 4, 1, 3, 2, 5).contiguous().reshape(N * K // (td_x * td_y), (td_x * td_y) * nbits // 8)
    # bytes -> bits -> per-code bit-planes -> integer codes
    bits_mask = (2 ** torch.arange(8, device=mat_packed.device, dtype=torch.int16)).view(1, 1, 8)
    bits_bool_8 = (packed_8.unsqueeze(-1) & bits_mask) > 0
    bits_bool = bits_bool_8.reshape(N * K // (td_x * td_y), (td_x * td_y), nbits)
    K_mask = 2 ** torch.arange(nbits, device=mat_packed.device).view(1, 1, -1)
    indices = (bits_bool * K_mask).sum(dim=-1)
    # undo the intra-tile permutation, then the tiling itself
    indices = indices.reshape(N * K // (td_x * td_y), td_x * td_y).index_select(dim=1, index=_INV_PERMUTE.to(mat_packed.device))
    indices = indices.reshape(N // td_x, K // td_y, td_x, td_y)
    indices = indices.permute(0, 2, 1, 3).reshape(N, K).contiguous()
    return indices
219
+
220
@torch.compile
def dequantize_mat_sq_inds_vec2(mat_packed: torch.Tensor,
                                N: int,
                                K: int,
                                lut_bits: int,
                                td_x: int = 16,
                                td_y: int = 16) -> torch.Tensor:
    '''
    Inverse of the vec_sz=2 tensor-core packing: recover the per-vector
    code indices. Returns an (N, K // 2) index tensor (one code per
    2-column vector).
    '''
    # split each byte into its two nibbles (low nibble, high nibble)
    mat_packed8 = mat_packed.view(torch.uint8).flatten().unsqueeze(-1).expand(-1, 2).contiguous()

    mat_packed48 = torch.zeros_like(mat_packed8)
    mat_packed48[:, 0] = mat_packed8[:, 0] & 0b1111
    mat_packed48[:, 1] = mat_packed8[:, 1] >> 4

    # undo the (2, 2, ...) tile interleaving applied at pack time
    packed_4 = mat_packed48.reshape(N // td_x // 2, K // td_y // 2, td_x * td_y // 8, 2, 2, lut_bits).permute(0,4,1,3,2,5).reshape(N * K // (td_x * td_y), -1).contiguous()
    # nibbles -> bits -> per-code bit-planes -> integer codes
    bits_mask = (2 ** torch.arange(4, device=mat_packed.device, dtype=torch.int16)).view(1, 1, 4)
    bits_bool_4 = (packed_4.unsqueeze(-1) & bits_mask) > 0
    bits_bool = bits_bool_4.reshape(N * K // (td_x * td_y), td_x * td_y // 2, lut_bits)
    K_mask = 2 ** torch.arange(lut_bits, device=mat_packed.device).view(1, 1, -1)
    indices = (bits_bool * K_mask).sum(dim=-1)
    # re-duplicate each code across its vector pair so the scalar inverse
    # permutation can be applied, then collapse the pair back to one code
    indices = indices.reshape(N * K // (td_x * td_y), td_x * td_y // 2, 1).expand(-1, -1, 2).reshape(N * K // (td_x * td_y), td_x * td_y).contiguous()
    indices = indices.index_select(dim=1, index=_INV_PERMUTE.to(mat_packed.device))
    indices = indices.reshape(N // td_x, K // td_y, td_x, td_y)
    indices = indices.permute(0, 2, 1, 3).reshape(N, K // 2, 2)
    indices = indices[:, :, 0].contiguous()
    return indices
245
+
246
def convert_tensor_core_to_simt(mat_packed, N, K, vec_sz, lut_bit, code_n, codeT_sz=32, td_x=16, td_y=16):
    '''
    Re-pack a tensor-core-layout weight tensor into the SIMT kernel layout.

    The unpack/repack runs on the GPU; the result is returned on the
    input tensor's original device.
    '''
    orig_device = mat_packed.device
    mat_packed = mat_packed.cuda()
    if vec_sz == 2:
        inds = dequantize_mat_sq_inds_vec2(mat_packed, N, K, lut_bit, td_x, td_y)
        repacked = pack_qweight_vq_simt(inds, lut_bit, vec_sz, code_n, codeT_sz)
    else:
        inds = dequantize_mat_sq_inds(mat_packed, N, K, lut_bit, td_x, td_y)
        repacked = pack_qweight_sq_simt(inds, lut_bit)
    return repacked.to(orig_device).contiguous()
258
+
259
+
260
+
261
if __name__ == "__main__":
    # Smoke test: round-trip random indices through the tensor-core packer,
    # the index dequantizers, and the SIMT conversion (requires a CUDA device).
    nbits = 5
    Qidxs = torch.randint(0, 2**nbits, (11008, 4096)).cuda()
    packed = pack_qweight(Qidxs, 1, nbits)

    indices = dequantize_mat_sq_inds(packed, 11008, 4096, nbits)

    converted = convert_tensor_core_to_simt(packed, 11008, 4096, 1, nbits, code_n=nbits)

    # vec_sz=2: each code covers 2 columns, so (11008, 2048) codes => K=4096
    Qidxs2 = torch.randint(0, 2**nbits, (11008, 2048)).cuda()
    packed2 = pack_qweight(Qidxs2, 2, nbits)

    indices2 = dequantize_mat_sq_inds_vec2(packed2, 11008, 4096, nbits)

    converted2 = convert_tensor_core_to_simt(packed2, 11008, 4096, 2, nbits, code_n=nbits)

    # NOTE(review): interactive debugger breakpoint left in; remove before
    # running unattended.
    import ipdb; ipdb.set_trace()
lib/quantizer/tcq_quant.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from lib.utils import block_LDL, matmul_hadUt, matmul_hadUt_head
3
+ from lib.algo.ldlq import LDLQ
4
+ from lib.quantizer.quant_op import load_hessian, load_group_hessian
5
+ from lib.linear import QTIPLinearTCQ, IncoherentLinear
6
+ from lib.codebook.bitshift import bitshift_codebook
7
+ import torch._dynamo
8
+ import time
9
class Args:
    '''Lightweight parameter container consumed by LDLQ (tile dims + vector size).'''

    def __init__(self, td_x, td_y, V):
        self.td_x, self.td_y, self.V = td_x, td_y, V
15
def qtip_quantize_mat(Wr, HRr, Wscale, cb, td_x=16, td_y=16, KV=4, V=2, use_hess=True):
    '''
    Quantize a (scaled, incoherent) weight matrix with LDLQ over a trellis
    codebook and pack the resulting codes for the TCQ kernel.

    Wr: (m, n) weight matrix (already divided by Wscale)
    HRr: (n, n, gs) Hessian slices; row-block i of Wr uses slice i
    Wscale: per-row scale, multiplied back in before the error report
    cb: trellis codebook (provides pack_trellis and runs on CUDA)
    td_x, td_y: tile dims; KV, V: trellis packing parameters
    use_hess: if False, quantize against the identity instead of HRr

    return: (packed int16 trellis tensor, hatWr with scales re-applied,
             quant_info dict)
    '''
    # NOTE(review): HRr_orig is captured but never used below.
    HRr_orig = HRr.clone()
    Wr = Wr.to(torch.float64)
    (m, n) = Wr.shape
    gs = HRr.shape[-1]
    LRrs = []
    diag = torch.arange(n, device=HRr.device)
    if not use_hess:
        # identity Hessian => plain rounding behavior inside LDLQ
        eye = torch.eye(n, device=Wr.device, dtype=torch.float64)
        LRr, D = block_LDL(eye, td_y)
        LRr[diag, diag] = 0
        LRrs.append(LRr)
    else:
        # one LDL factor per Hessian group; diagonal zeroed for the LDLQ update
        for i in range(gs):
            LRr, D = block_LDL(HRr[:,:,i], td_y)
            LRr[diag, diag] = 0
            LRrs.append(LRr)

    args = Args(td_x, td_y, V)

    # quantize each row block against its own LDL factor
    Qidxs_list = []
    hatWr_list = []
    for i in range(gs):
        cur_Wr = Wr[m // gs * i:m // gs * (i+1)]
        hatWr, Qidxs = LDLQ(cur_Wr, LRrs[i], cb.cuda(), args, for_kernel=True)
        hatWr_list.append(hatWr)
        Qidxs_list.append(Qidxs)
    hatWr = torch.cat(hatWr_list, dim=0)
    Qidxs = torch.cat(Qidxs_list, dim=0)
    assert hatWr.shape == Wr.shape, f"hatWr.shape {hatWr.shape} != Wr.shape {Wr.shape}"

    Qidxs = Qidxs.cpu()
    # tile the trellis indices and pack them into the codebook's bit stream
    packed = cb.pack_trellis(
        Qidxs.reshape(m // td_x, td_x, n // td_y,
                      td_y // V).transpose(1, 2).reshape(
                          -1, td_x * td_y // V))

    # repack: int16 words -> bytes -> nibbles, re-interleave per 2x2 tile
    # group, then fuse nibbles back into the kernel's int16 layout
    packed_8 = packed.view(torch.uint8).view(-1, 2)
    packed_4 = torch.cat([packed_8.unsqueeze(-1) & (2 ** 4 - 1), (packed_8.unsqueeze(-1) & (2 ** 8 - 2 ** 4)) >> 4], dim=-1).view(-1, 4).flip(
        (-1, ))

    packed_4 = packed_4.reshape(m // 16 // 2, 2, n // 16 // 2, 2, 16 * 16 // 8,
                                KV).permute(0, 2, 4, 3, 1, 5).flip(
                                    (-1, )).contiguous().flatten()
    packed_8 = torch.sum(packed_4.view(-1, 2) * torch.Tensor([[1, 2 ** 4]]).to(torch.uint8), dim=-1).to(torch.uint8).contiguous()
    packed = packed_8.view(torch.int16).reshape(packed.shape).cuda()

    # restore the original magnitudes before reporting the error
    Wr *= Wscale.reshape(-1, 1)
    hatWr *= Wscale.reshape(-1, 1)

    orig_err = (Wr - hatWr).pow(2).mean()
    err = orig_err / Wr.pow(2).mean()
    print(
        f'err {err.item()} orig_err {orig_err.item()}'
    )
    quant_info = {
        "quantizer": "tcq_ldlq",
        "td_x": td_x,
        "td_y": td_y,
        "KV": KV,
        "V": V,
        "use_hess": use_hess,
        "orig_err": orig_err.item(),
        "err": err.item(),
    }
    return packed, hatWr, quant_info
81
+
82
def inc_linear_to_inc_tcq_linear(inc_linear, HRr, cb, td_x=16, td_y=16, KV=4, V=2, scale_override=0.9, use_hess=True):
    '''
    Replace the dense inner linear of an IncoherentLinear with a quantized
    QTIPLinearTCQ built from its weights.

    inc_linear: IncoherentLinear whose .linear holds the weights to quantize
    HRr: rotated Hessian slices for qtip_quantize_mat
    cb: trellis codebook
    td_x, td_y, KV, V: tile / trellis parameters
    scale_override: extra scale folded into the weights and out of Wscale
    use_hess: forwarded to qtip_quantize_mat

    return: (inc_linear with .linear swapped, quant_info dict)
    '''
    # fold the override into the weights and compensate in Wscale
    Wr = inc_linear.linear.weight.data * scale_override
    Wscale = inc_linear.Wscale.data / scale_override
    inc_linear.Wscale.data.copy_(Wscale)

    packed, hatWr, quant_info = qtip_quantize_mat(Wr, HRr, Wscale, cb, td_x=td_x, td_y=td_y, KV=KV, V=V, use_hess=use_hess)
    out_features, in_features = Wr.shape
    tcq_linear = QTIPLinearTCQ(
        in_features,
        out_features,
        # Fix: forward the td_x/td_y parameters instead of hard-coded 16s so
        # non-default tile sizes reach the kernel module (defaults unchanged).
        td_x=td_x,
        td_y=td_y,
        L=16,
        KV=KV,
        V=V,
        tlut_bits=cb.tlut_bits,
        bias=inc_linear.bias is not None,
        dtype=inc_linear.dtype,
    )

    tcq_linear.trellis.data.copy_(packed)
    tcq_linear.tlut.data.copy_(cb.tlut)

    inc_linear.linear = tcq_linear
    return inc_linear, quant_info
107
+
108
def linear_to_incoherent_for_tcq(linear, cb, HR, scale_override=0.9, SU=None, SV=None, lnorm=None, hadU=None, hadV=None, rot_info="all", left_only=False):
    '''
    Wrap a dense nn.Linear in an IncoherentLinear: apply random sign flips
    plus Hadamard rotations to the weights and Hessian, and normalize the
    weights to the codebook's scale.

    linear: source nn.Linear
    cb: codebook (its lut sets the target RMS scale)
    HR: (n, n, groups) Hessian to rotate alongside the weights
    scale_override: shrinks the target scale (weights end up slightly larger)
    SU, SV: optional +/-1 sign vectors; drawn at random when None
    lnorm: accepted for interface compatibility  # NOTE(review): unused here
    hadU, hadV: Hadamard sizes/handles for the two sides
    rot_info: rotation mode stored on the IncoherentLinear
    left_only: rotate only the input side (SV forced to all-ones)

    return: (IncoherentLinear with rotated weights, rotated Hessian HRr)
    '''
    dtype_ = torch.float32
    device = linear.weight.device
    inc_linear = IncoherentLinear(linear.in_features, linear.out_features, hadU, hadV, linear.bias is not None, dtype_)
    # random +/-1 sign vectors for incoherence processing
    if SU is None:
        SU = ((torch.randn(linear.in_features, dtype=dtype_) > 0.0) * 2.0 - 1.0).to(device).to(dtype_)
    if SV is None:
        SV = ((torch.randn(linear.out_features, dtype=dtype_) > 0.0) * 2.0 - 1.0).to(device).to(dtype_)

    if left_only:
        SV = torch.ones_like(SV)

    if linear.bias is not None:
        inc_linear.bias.data.copy_(linear.bias)

    # rotate the weights: both sides, or only the input side for left_only
    W = linear.weight.data.clone().to(dtype_)
    Wr = matmul_hadUt_head(matmul_hadUt_head(W.T.to(device) * SV, hadV).T * SU, hadU) if not left_only else matmul_hadUt_head(W * SU, hadU)

    # scale so the weights match the codebook RMS (per-row when left_only,
    # a single global scale broadcast to rows otherwise)
    if left_only:
        Wscale = Wr.to(torch.float64).square().mean(-1).sqrt().view(-1, 1).to(dtype_) / (cb.lut.to(torch.float64).square().mean().sqrt().float() * scale_override) # (out_features, 1)
    else:
        Wscale = Wr.to(torch.float64).square().mean().sqrt().view(-1, 1).to(dtype_) / (cb.lut.to(torch.float64).square().mean().sqrt().float() * scale_override) # (1, 1)
        Wscale = Wscale.repeat(Wr.shape[0], 1) # (out_features, 1)

    Wr = Wr / Wscale
    # rotate every Hessian slice with the inverse sign vector on both sides
    HRr = torch.zeros_like(HR)
    for i in range(HR.shape[-1]):
        HRr[:,:,i] = matmul_hadUt_head(matmul_hadUt_head(HR[:,:,i].to(device).contiguous() * (1./ SU), hadU).T * (1./ SU), hadU)

    # the stored vectors are the inverses used at inference time
    inc_linear.SU.data.copy_(1./SU.to(dtype_))
    inc_linear.SV.data.copy_(1./SV.to(dtype_))
    inc_linear.Wscale.data.copy_(Wscale.view(-1))
    inc_linear.linear.weight.data.copy_(Wr.to(dtype_))
    inc_linear.rot_info = rot_info
    inc_linear.apply_rot_info()
    return inc_linear, HRr
144
+
145
def linear_to_tcq_linear(target_layer, hess_path, cb, scale_override=0.9, KV=4, V=2, use_hess=True, SU=None, SV=None, lnorm=None, hadU=None, hadV=None, rot_info="all", left_only=False, ghess_key=""):
    '''
    End-to-end pipeline: load the Hessian, build an incoherent wrapper
    around target_layer, and quantize it into a TCQ linear.

    target_layer: dense nn.Linear to replace
    hess_path: Hessian checkpoint path (None => identity Hessian)
    cb: trellis codebook
    ghess_key: when non-empty, load a grouped Hessian under this key
    remaining args are forwarded to the incoherence / quantization steps

    return: (quantized layer cast to fp16, quant_info dict with timing)
    '''
    t0 = time.time()
    out_features, in_features = target_layer.weight.shape
    if ghess_key == "":
        # fall back to the identity when no Hessian file is given
        HR = load_hessian(hess_path).cuda() if hess_path is not None else torch.eye(in_features, device="cuda", dtype=torch.float64).unsqueeze(-1)
    else:
        HR = load_group_hessian(hess_path, layer_key=ghess_key).cuda()
    layer, HRr = linear_to_incoherent_for_tcq(target_layer, cb, HR, scale_override, SU=SU, SV=SV, lnorm=lnorm, hadU=hadU, hadV=hadV, rot_info=rot_info, left_only=left_only)
    HRr = HRr.cuda()
    layer = layer.cuda()
    # scale_override was already applied above, so pass 1.0 here
    layer, quant_info = inc_linear_to_inc_tcq_linear(layer, HRr, cb, scale_override=1.0, td_x=16, td_y=16, KV=KV, V=V, use_hess=use_hess)
    quant_info["scale_override"] = scale_override
    quant_info["hess_path"] = hess_path
    quant_info["time"] = time.time() - t0

    return layer.to(torch.float16), quant_info
lib/quantizer/vq_quant.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from lib.quantizer.quant_op import _INV_PERMUTE, load_hessian, load_group_hessian
3
+ from lib.linear import IncoherentLinear
4
+ from lib.utils import matmul_hadUt, matmul_hadUt_head, clean
5
+ from lib.utils.kmeans import kmeans_sklearn, kmeans_flash1d
6
+ from lib.quantizer.nuq_op import train_least_squares
7
+ from lib.quantizer.quant_op import pack_qweight
8
+ from lib.linear import VQLinearPackTensorCore
9
+
10
+ import random
11
+ import time
12
def simple_vq(Wr, vec_sz, lut_bits, batch_size=256):
    """Build an initial VQ assignment for ``Wr`` via k-means.

    Args:
        Wr: (rows, cols) weight matrix; ``cols`` must be divisible by
            ``vec_sz``.
        vec_sz: dimensionality of each codebook vector.
        lut_bits: the codebook has ``2 ** lut_bits`` entries.
        batch_size: rows assigned per chunk; bounds the size of the
            pairwise-distance tensor.

    Returns:
        init_P: (rows, cols // vec_sz, 2**lut_bits) uint8 one-hot
            assignment tensor on ``Wr``'s device.
        init_centroids: (2**lut_bits, vec_sz) float32 codebook.
    """
    # Large codebooks blow up the (batch, cols, 2**lut_bits) distance
    # tensor below, so cap the batch size.
    if lut_bits >= 12:
        batch_size = 64
    num_centroids = 2 ** lut_bits

    # K-means over all vectors; the scalar case has a fast exact 1-D solver.
    Wr_flatten = Wr.reshape(-1, vec_sz)
    if vec_sz == 1:
        init_centroids = kmeans_flash1d(Wr_flatten, num_centroids)
    else:
        init_centroids = kmeans_sklearn(Wr_flatten, num_centroids)

    Wr_vec = Wr.reshape(Wr.shape[0], -1, vec_sz)  # (rows, cols // vec_sz, vec_sz)

    # Nearest-centroid assignment, batched over rows to bound peak memory.
    # Allocate index/one-hot tensors directly with the target dtype and
    # device (the original zeros(...).to(device).long() built a float32
    # CPU tensor first, then transferred and cast it).
    min_indices = torch.zeros(
        Wr.shape[0], Wr.shape[1] // vec_sz, dtype=torch.long, device=Wr.device
    )
    for s_idx in range(0, Wr.shape[0], batch_size):
        e_idx = min(s_idx + batch_size, Wr.shape[0])
        dist_sq = ((Wr_vec[s_idx:e_idx].unsqueeze(2)
                    - init_centroids.unsqueeze(0).unsqueeze(0)) ** 2).sum(dim=-1)
        min_indices[s_idx:e_idx] = dist_sq.argmin(dim=-1)  # (batch, cols // vec_sz)

    # One-hot encode the assignments; uint8 keeps the tensor compact.
    init_P = torch.zeros(
        Wr.shape[0], Wr.shape[1] // vec_sz, num_centroids,
        dtype=torch.uint8, device=Wr.device,
    )
    init_P.scatter_(2, min_indices.unsqueeze(-1), 1)

    return init_P, init_centroids.to(torch.float32)
34
+
35
def vq_quantize_mat(Wr, HRr, Wscale, vec_sz, lut_bits, iterations=6, use_hess=True):
    """Vector-quantize the weight matrix ``Wr`` with a 2**lut_bits-entry LUT.

    Seeds assignments/codebook with k-means (``simple_vq``); when
    ``use_hess`` is set, refines both with Hessian-weighted alternating
    least squares (``train_least_squares``). Errors are reported in the
    unscaled domain (after multiplying back by ``Wscale``).

    Returns:
        (packed qweight tensor, codebook C, dequantized hatWr, quant_info dict).
    """
    Wr = Wr.to(torch.float64)
    if use_hess:
        assert len(HRr.shape) == 3, "HRr must be a 3D tensor"
        assert HRr.shape[0] == HRr.shape[1], "HRr must be a square matrix"
        init_P, init_centroids = simple_vq(Wr, vec_sz, lut_bits)
        # train_least_squares runs on CPU/numpy; Hessian is moved to
        # leading-group layout (groups, n, n).
        P, C, log_dict = train_least_squares(
            W=Wr.detach().cpu().numpy(),
            init_P=init_P.detach().cpu().to(torch.float32).numpy(),
            init_centroids=init_centroids.detach().cpu().numpy(),
            H=HRr.permute(2, 0, 1).detach().cpu().numpy(),
            num_iterations=iterations,
        )
    else:
        # No Hessian refinement: keep the raw k-means assignment.
        init_P, init_centroids = simple_vq(Wr, vec_sz, lut_bits)
        P, C = init_P, init_centroids
    # NOTE(review): `.to(Wr.device)` implies P and C are torch tensors in
    # both branches — presumably train_least_squares returns tensors, not
    # numpy arrays; confirm against its implementation.
    P = P.to(Wr.device)
    C = C.to(Wr.device)

    # Dequantize: hard-assign each vector to its argmax codeword.
    P_ind = torch.argmax(P, dim=-1)
    hatWr = C[P_ind]
    hatWr = hatWr.view(hatWr.shape[0], -1)

    # Undo the per-row normalization so errors are in the original scale.
    Wr *= Wscale.view(-1, 1)
    hatWr *= Wscale.view(-1, 1)

    orig_err = (Wr - hatWr).pow(2).mean()
    err = (Wr - hatWr).pow(2).mean() / (Wr.pow(2).mean())  # relative MSE
    print(
        f'err {err.item()} orig_err {orig_err.item()}'
    )
    quant_info = {
        "quantizer": "vq_lnq",
        "vec_sz": vec_sz,
        "lut_bits": lut_bits,
        "use_hess": use_hess,
        "iterations": iterations,
        "orig_err": orig_err.item(),
        "err": err.item(),
    }

    # pack P appropriately to kernel
    packed = pack_qweight(P, vec_sz, lut_bits)
    return packed, C, hatWr, quant_info
79
+
80
def inc_linear_to_inc_vq_linear(inc_linear, HRr, lut_bits=4, vec_sz=2, scale_override=0.9, use_hess=True):
    """Replace the dense inner linear of ``inc_linear`` with a packed VQ linear.

    The weights are pre-scaled by ``scale_override`` (and the stored per-row
    scale compensated accordingly), quantized with ``vq_quantize_mat``, and
    the result installed as a ``VQLinearPackTensorCore`` module.

    Returns:
        (the mutated inc_linear, quant_info dict).
    """
    # Fold scale_override into the weights and compensate Wscale so the
    # overall layer output is unchanged.
    scaled_W = inc_linear.linear.weight.data * scale_override
    row_scale = inc_linear.Wscale.data / scale_override
    inc_linear.Wscale.data.copy_(row_scale)

    packed, lut, _hatWr, quant_info = vq_quantize_mat(
        scaled_W, HRr, row_scale, vec_sz, lut_bits, use_hess=use_hess
    )

    # Build the packed tensor-core linear and load the quantized payload.
    n_out, n_in = scaled_W.shape
    vq_layer = VQLinearPackTensorCore(
        n_in,
        n_out,
        lut_bits=lut_bits,
        vec_sz=vec_sz,
        bias=inc_linear.bias is not None,
        dtype=inc_linear.dtype,
    )
    vq_layer.qweight.data.copy_(packed)
    vq_layer.lut.data.copy_(lut.view(2 ** lut_bits, vec_sz))

    inc_linear.linear = vq_layer
    return inc_linear, quant_info
100
+
101
def linear_to_incoherent_for_vq(linear, HR, scale_override=0.9, SU=None, SV=None, lnorm=None, hadU=None, hadV=None, rot_info="all", left_only=False):
    """Rotate ``linear`` into an incoherent basis ahead of VQ quantization.

    Applies random-sign flips (SU/SV) and Hadamard transforms to the weight
    matrix and the Hessian, normalizes each weight row, and stores the
    inverse transforms in a fresh ``IncoherentLinear``.

    Args:
        linear: source dense layer.
        HR: (in, in, groups) Hessian stack to rotate alongside the weights.
        scale_override: divides the per-row RMS scale (so the stored
            normalized weights are multiplied by it).
        SU / SV: optional precomputed sign vectors; random +/-1 otherwise.
        lnorm: accepted but unused here — presumably consumed by callers
            or sibling variants; verify before removing.
        left_only: if True, skip the output-side (SV/hadV) rotation.

    Returns:
        (inc_linear with rotated weights, rotated Hessian HRr).
    """
    dtype_ = torch.float32
    device = linear.weight.device
    inc_linear = IncoherentLinear(linear.in_features, linear.out_features, hadU, hadV, linear.bias is not None, dtype_)
    # Random +/-1 sign vectors for the incoherence transform.
    if SU is None:
        SU = ((torch.randn(linear.in_features, dtype=dtype_) > 0.0) * 2.0 - 1.0).to(device).to(dtype_)
    if SV is None:
        SV = ((torch.randn(linear.out_features, dtype=dtype_) > 0.0) * 2.0 - 1.0).to(device).to(dtype_)

    if left_only:
        # Output side untouched: neutralize SV.
        SV = torch.ones_like(SV)

    if linear.bias is not None:
        inc_linear.bias.data.copy_(linear.bias)

    W = linear.weight.data.to(dtype_)
    # Two-sided (or left-only) sign + Hadamard rotation of the weights.
    Wr = matmul_hadUt_head(matmul_hadUt_head(W.T.to(device) * SV, hadV).T * SU, hadU) if not left_only else matmul_hadUt_head(W * SU, hadU)
    # Per-row RMS scale in float64 for accuracy; scale_override < 1 leaves
    # headroom in the normalized weights.
    Wscale = Wr.to(torch.float64).square().mean(-1).sqrt().view(-1, 1).to(dtype_) / scale_override

    Wr = Wr / Wscale
    # Rotate each Hessian group with the same input-side transform
    # (conjugation by the sign-flipped Hadamard).
    HRr = torch.zeros_like(HR)
    for i in range(HR.shape[-1]):
        HRr[:,:,i] = matmul_hadUt_head(matmul_hadUt_head(HR[:,:,i].to(device).contiguous() * (1./ SU), hadU).T * (1./ SU), hadU)

    # Store the inverse sign vectors (signs are their own inverse up to
    # reciprocal) and the row scales for dequantization at runtime.
    inc_linear.SU.data.copy_(1./SU.to(dtype_))
    inc_linear.SV.data.copy_((1./SV).to(dtype_))
    inc_linear.Wscale.data.copy_(Wscale.view(-1))
    inc_linear.linear.weight.data.copy_(Wr.to(dtype_))
    inc_linear.rot_info = rot_info
    inc_linear.apply_rot_info()
    return inc_linear, HRr
132
+
133
def linear_to_vq_linear(target_layer, hess_path, scale_override=0.9, lut_bits=4, vec_sz=1, use_hess=True, SU=None, SV=None, lnorm=None, hadU=None, hadV=None, rot_info="all", left_only=False, ghess_key=""):
    """Quantize a dense linear layer into an incoherent VQ linear layer.

    Loads the (group) Hessian, rotates the layer into the incoherent basis,
    then vector-quantizes the rotated weights.

    Returns:
        (quantized fp16 layer, dict of quantization metadata incl. timing).
    """
    t0 = time.time()
    out_features, in_features = target_layer.weight.shape

    # Hessian source: per-layer file, grouped file keyed by ghess_key, or
    # an identity proxy when none is available.
    if ghess_key == "":
        if hess_path is not None:
            HR = load_hessian(hess_path).cuda()
        else:
            HR = torch.eye(in_features, device="cuda", dtype=torch.float64).unsqueeze(-1)
    else:
        HR = load_group_hessian(hess_path, layer_key=ghess_key).cuda()

    layer, HRr = linear_to_incoherent_for_vq(
        target_layer, HR, scale_override,
        SU=SU, SV=SV, lnorm=lnorm, hadU=hadU, hadV=hadV,
        rot_info=rot_info, left_only=left_only,
    )
    layer, quant_info = inc_linear_to_inc_vq_linear(
        layer.cuda(), HRr.cuda(),
        scale_override=1.0, lut_bits=lut_bits, vec_sz=vec_sz, use_hess=use_hess,
    )

    quant_info["scale_override"] = scale_override
    quant_info["hess_path"] = hess_path
    quant_info["time"] = time.time() - t0
    print("elapsed time", time.time() - t0)
    return layer.to(torch.float16), quant_info