diff --git "a/.cache/MatMulNBits_2_0_meta.json" "b/.cache/MatMulNBits_2_0_meta.json"
new file mode 100644--- /dev/null
+++ "b/.cache/MatMulNBits_2_0_meta.json"
@@ -0,0 +1,36347 @@
+{
+  "dd_meta_major_version": 1,
+  "dd_meta_minor_version": 4,
+  "state_table_updates": [
+    {
+      "state_table_idx": 0,
+      "update_func": 1,
+      "update_arg": 1
+    }
+  ],
+  "op_list": [
+    {
+      "name": "MatMulNBits_2_0",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.0/input_layernorm/output_0.out5_4_0"
+      ],
+      "const_args": [
+        "model.layers.0.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.0/attn/qk_proj/MatMulNBits/output_0.out5_4_0"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.0.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.0/input_layernorm/output_0.out5_4_0"
+      ],
+      "const_args": [
+        "model.layers.0.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.0.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "3",
+            "1"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.0/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.0/attn/qk_proj/MatMulNBits/output_0.out5_4_0",
+        "past_key_values.0.key",
+        "past_key_values.0.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.0/attn/GroupQueryAttention/output_0.out2_0",
+        "present.0.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "0",
+            "0",
+            "2",
+            "0",
+            "1",
+            "1",
+            "6",
+            "0",
+            "2",
+            "0"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.0.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.0/attn/GroupQueryAttention/output_0.out2_0"
+      ],
+      "const_args": [
+        "model.layers.0.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.0/attn/o_proj/MatMulNBits/output_0.out5_4_1"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_0",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/embed_tokens/Gather/output_0.out4_0",
+        "/model/layers.0/attn/o_proj/MatMulNBits/output_0.out5_4_1"
+      ],
+      "const_args": [
+        "model.layers.0.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.0/post_attention_layernorm/output_3.out4_0",
+        "/model/layers.0/post_attention_layernorm/output_0.out4_0"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_0",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.0/post_attention_layernorm/output_0.out4_0"
+      ],
+      "const_args": [
+        "model.layers.0.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.0.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.0.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.0.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.0.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.0/mlp/Mul/output_0.out3_0"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.0.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.0/mlp/Mul/output_0.out3_0"
+      ],
+      "const_args": [
+        "model.layers.0.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.0/mlp/down_proj/MatMulNBits/output_0.out5_4_2"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_1",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.0/post_attention_layernorm/output_3.out4_0",
+        "/model/layers.0/mlp/down_proj/MatMulNBits/output_0.out5_4_2"
+      ],
+      "const_args": [
+        "model.layers.1.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.1/input_layernorm/output_3.out4_1",
+        "/model/layers.1/input_layernorm/output_0.out4_1"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_1",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.1/input_layernorm/output_0.out4_1"
+      ],
+      "const_args": [
+        "model.layers.1.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.1/attn/qk_proj/MatMulNBits/output_0.out5_4_3"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.1.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.1/input_layernorm/output_0.out4_1"
+      ],
+      "const_args": [
+        "model.layers.1.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.1.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "7",
+            "3"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.1/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.1/attn/qk_proj/MatMulNBits/output_0.out5_4_3",
+        "past_key_values.1.key",
+        "past_key_values.1.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.1/attn/GroupQueryAttention/output_0.out2_1",
+        "present.1.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "4",
+            "2",
+            "2",
+            "0",
+            "5",
+            "3",
+            "6",
+            "0",
+            "6",
+            "2"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.1.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.1/attn/GroupQueryAttention/output_0.out2_1"
+      ],
+      "const_args": [
+        "model.layers.1.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.1/attn/o_proj/MatMulNBits/output_0.out5_4_4"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_2",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.1/input_layernorm/output_3.out4_1",
+        "/model/layers.1/attn/o_proj/MatMulNBits/output_0.out5_4_4"
+      ],
+      "const_args": [
+        "model.layers.1.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.1/post_attention_layernorm/output_3.out4_2",
+        "/model/layers.1/post_attention_layernorm/output_0.out4_2"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_1",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.1/post_attention_layernorm/output_0.out4_2"
+      ],
+      "const_args": [
+        "model.layers.1.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.1.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.1.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.1.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.1.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.1/mlp/Mul/output_0.out3_1"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.1.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.1/mlp/Mul/output_0.out3_1"
+      ],
+      "const_args": [
+        "model.layers.1.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.1/mlp/down_proj/MatMulNBits/output_0.out5_4_5"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_3",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.1/post_attention_layernorm/output_3.out4_2",
+        "/model/layers.1/mlp/down_proj/MatMulNBits/output_0.out5_4_5"
+      ],
+      "const_args": [
+        "model.layers.2.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.2/input_layernorm/output_3.out4_3",
+        "/model/layers.2/input_layernorm/output_0.out4_3"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_2",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.2/input_layernorm/output_0.out4_3"
+      ],
+      "const_args": [
+        "model.layers.2.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.2/attn/qk_proj/MatMulNBits/output_0.out5_4_6"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.2.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.2/input_layernorm/output_0.out4_3"
+      ],
+      "const_args": [
+        "model.layers.2.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.2.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "11",
+            "5"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.2/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.2/attn/qk_proj/MatMulNBits/output_0.out5_4_6",
+        "past_key_values.2.key",
+        "past_key_values.2.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.2/attn/GroupQueryAttention/output_0.out2_2",
+        "present.2.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "8",
+            "4",
+            "2",
+            "0",
+            "9",
+            "5",
+            "6",
+            "0",
+            "10",
+            "4"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.2.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.2/attn/GroupQueryAttention/output_0.out2_2"
+      ],
+      "const_args": [
+        "model.layers.2.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.2/attn/o_proj/MatMulNBits/output_0.out5_4_7"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_4",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.2/input_layernorm/output_3.out4_3",
+        "/model/layers.2/attn/o_proj/MatMulNBits/output_0.out5_4_7"
+      ],
+      "const_args": [
+        "model.layers.2.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.2/post_attention_layernorm/output_3.out4_4",
+        "/model/layers.2/post_attention_layernorm/output_0.out4_4"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_2",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.2/post_attention_layernorm/output_0.out4_4"
+      ],
+      "const_args": [
+        "model.layers.2.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.2.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.2.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.2.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.2.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.2/mlp/Mul/output_0.out3_2"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.2.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.2/mlp/Mul/output_0.out3_2"
+      ],
+      "const_args": [
+        "model.layers.2.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.2/mlp/down_proj/MatMulNBits/output_0.out5_4_8"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_5",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.2/post_attention_layernorm/output_3.out4_4",
+        "/model/layers.2/mlp/down_proj/MatMulNBits/output_0.out5_4_8"
+      ],
+      "const_args": [
+        "model.layers.3.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.3/input_layernorm/output_3.out4_5",
+        "/model/layers.3/input_layernorm/output_0.out4_5"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_3",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.3/input_layernorm/output_0.out4_5"
+      ],
+      "const_args": [
+        "model.layers.3.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.3/attn/qk_proj/MatMulNBits/output_0.out5_4_9"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.3.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.3/input_layernorm/output_0.out4_5"
+      ],
+      "const_args": [
+        "model.layers.3.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.3.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "15",
+            "7"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.3/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.3/attn/qk_proj/MatMulNBits/output_0.out5_4_9",
+        "past_key_values.3.key",
+        "past_key_values.3.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.3/attn/GroupQueryAttention/output_0.out2_3",
+        "present.3.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "12",
+            "6",
+            "2",
+            "0",
+            "13",
+            "7",
+            "6",
+            "0",
+            "14",
+            "6"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.3.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.3/attn/GroupQueryAttention/output_0.out2_3"
+      ],
+      "const_args": [
+        "model.layers.3.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.3/attn/o_proj/MatMulNBits/output_0.out5_4_10"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_6",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.3/input_layernorm/output_3.out4_5",
+        "/model/layers.3/attn/o_proj/MatMulNBits/output_0.out5_4_10"
+      ],
+      "const_args": [
+        "model.layers.3.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.3/post_attention_layernorm/output_3.out4_6",
+        "/model/layers.3/post_attention_layernorm/output_0.out4_6"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_3",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.3/post_attention_layernorm/output_0.out4_6"
+      ],
+      "const_args": [
+        "model.layers.3.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.3.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.3.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.3.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.3.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.3/mlp/Mul/output_0.out3_3"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.3.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.3/mlp/Mul/output_0.out3_3"
+      ],
+      "const_args": [
+        "model.layers.3.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.3/mlp/down_proj/MatMulNBits/output_0.out5_4_11"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_7",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.3/post_attention_layernorm/output_3.out4_6",
+        "/model/layers.3/mlp/down_proj/MatMulNBits/output_0.out5_4_11"
+      ],
+      "const_args": [
+        "model.layers.4.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.4/input_layernorm/output_3.out4_7",
+        "/model/layers.4/input_layernorm/output_0.out4_7"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_4",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.4/input_layernorm/output_0.out4_7"
+      ],
+      "const_args": [
+        "model.layers.4.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.4/attn/qk_proj/MatMulNBits/output_0.out5_4_12"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.4.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.4/input_layernorm/output_0.out4_7"
+      ],
+      "const_args": [
+        "model.layers.4.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.4.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "19",
+            "9"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.4/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.4/attn/qk_proj/MatMulNBits/output_0.out5_4_12",
+        "past_key_values.4.key",
+        "past_key_values.4.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.4/attn/GroupQueryAttention/output_0.out2_4",
+        "present.4.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "16",
+            "8",
+            "2",
+            "0",
+            "17",
+            "9",
+            "6",
+            "0",
+            "18",
+            "8"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.4.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.4/attn/GroupQueryAttention/output_0.out2_4"
+      ],
+      "const_args": [
+        "model.layers.4.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.4/attn/o_proj/MatMulNBits/output_0.out5_4_13"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_8",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.4/input_layernorm/output_3.out4_7",
+        "/model/layers.4/attn/o_proj/MatMulNBits/output_0.out5_4_13"
+      ],
+      "const_args": [
+        "model.layers.4.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.4/post_attention_layernorm/output_3.out4_8",
+        "/model/layers.4/post_attention_layernorm/output_0.out4_8"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_4",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.4/post_attention_layernorm/output_0.out4_8"
+      ],
+      "const_args": [
+        "model.layers.4.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.4.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.4.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.4.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.4.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.4/mlp/Mul/output_0.out3_4"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.4.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.4/mlp/Mul/output_0.out3_4"
+      ],
+      "const_args": [
+        "model.layers.4.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.4/mlp/down_proj/MatMulNBits/output_0.out5_4_14"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_9",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.4/post_attention_layernorm/output_3.out4_8",
+        "/model/layers.4/mlp/down_proj/MatMulNBits/output_0.out5_4_14"
+      ],
+      "const_args": [
+        "model.layers.5.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.5/input_layernorm/output_3.out4_9",
+        "/model/layers.5/input_layernorm/output_0.out4_9"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_5",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.5/input_layernorm/output_0.out4_9"
+      ],
+      "const_args": [
+        "model.layers.5.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.5/attn/qk_proj/MatMulNBits/output_0.out5_4_15"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.5.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.5/input_layernorm/output_0.out4_9"
+      ],
+      "const_args": [
+        "model.layers.5.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.5.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "23",
+            "11"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.5/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.5/attn/qk_proj/MatMulNBits/output_0.out5_4_15",
+        "past_key_values.5.key",
+        "past_key_values.5.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.5/attn/GroupQueryAttention/output_0.out2_5",
+        "present.5.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "20",
+            "10",
+            "2",
+            "0",
+            "21",
+            "11",
+            "6",
+            "0",
+            "22",
+            "10"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.5.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.5/attn/GroupQueryAttention/output_0.out2_5"
+      ],
+      "const_args": [
+        "model.layers.5.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.5/attn/o_proj/MatMulNBits/output_0.out5_4_16"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_10",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.5/input_layernorm/output_3.out4_9",
+        "/model/layers.5/attn/o_proj/MatMulNBits/output_0.out5_4_16"
+      ],
+      "const_args": [
+        "model.layers.5.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.5/post_attention_layernorm/output_3.out4_10",
+        "/model/layers.5/post_attention_layernorm/output_0.out4_10"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_5",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.5/post_attention_layernorm/output_0.out4_10"
+      ],
+      "const_args": [
+        "model.layers.5.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.5.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.5.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.5.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.5.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.5/mlp/Mul/output_0.out3_5"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.5.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.5/mlp/Mul/output_0.out3_5"
+      ],
+      "const_args": [
+        "model.layers.5.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.5/mlp/down_proj/MatMulNBits/output_0.out5_4_17"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_11",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.5/post_attention_layernorm/output_3.out4_10",
+        "/model/layers.5/mlp/down_proj/MatMulNBits/output_0.out5_4_17"
+      ],
+      "const_args": [
+        "model.layers.6.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.6/input_layernorm/output_3.out4_11",
+        "/model/layers.6/input_layernorm/output_0.out4_11"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_6",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.6/input_layernorm/output_0.out4_11"
+      ],
+      "const_args": [
+        "model.layers.6.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.6/attn/qk_proj/MatMulNBits/output_0.out5_4_18"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.6.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.6/input_layernorm/output_0.out4_11"
+      ],
+      "const_args": [
+        "model.layers.6.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.6.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "27",
+            "13"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.6/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.6/attn/qk_proj/MatMulNBits/output_0.out5_4_18",
+        "past_key_values.6.key",
+        "past_key_values.6.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.6/attn/GroupQueryAttention/output_0.out2_6",
+        "present.6.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "24",
+            "12",
+            "2",
+            "0",
+            "25",
+            "13",
+            "6",
+            "0",
+            "26",
+            "12"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.6.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.6/attn/GroupQueryAttention/output_0.out2_6"
+      ],
+      "const_args": [
+        "model.layers.6.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.6/attn/o_proj/MatMulNBits/output_0.out5_4_19"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_12",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.6/input_layernorm/output_3.out4_11",
+        "/model/layers.6/attn/o_proj/MatMulNBits/output_0.out5_4_19"
+      ],
+      "const_args": [
+        "model.layers.6.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.6/post_attention_layernorm/output_3.out4_12",
+        "/model/layers.6/post_attention_layernorm/output_0.out4_12"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_6",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.6/post_attention_layernorm/output_0.out4_12"
+      ],
+      "const_args": [
+        "model.layers.6.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.6.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.6.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.6.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.6.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.6/mlp/Mul/output_0.out3_6"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.6.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.6/mlp/Mul/output_0.out3_6"
+      ],
+      "const_args": [
+        "model.layers.6.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.6/mlp/down_proj/MatMulNBits/output_0.out5_4_20"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_13",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.6/post_attention_layernorm/output_3.out4_12",
+        "/model/layers.6/mlp/down_proj/MatMulNBits/output_0.out5_4_20"
+      ],
+      "const_args": [
+        "model.layers.7.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.7/input_layernorm/output_3.out4_13",
+        "/model/layers.7/input_layernorm/output_0.out4_13"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_7",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.7/input_layernorm/output_0.out4_13"
+      ],
+      "const_args": [
+        "model.layers.7.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.7/attn/qk_proj/MatMulNBits/output_0.out5_4_21"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.7.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.7/input_layernorm/output_0.out4_13"
+      ],
+      "const_args": [
+        "model.layers.7.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.7.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "31",
+            "15"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.7/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.7/attn/qk_proj/MatMulNBits/output_0.out5_4_21",
+        "past_key_values.7.key",
+        "past_key_values.7.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.7/attn/GroupQueryAttention/output_0.out2_7",
+        "present.7.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "28",
+            "14",
+            "2",
+            "0",
+            "29",
+            "15",
+            "6",
+            "0",
+            "30",
+            "14"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.7.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.7/attn/GroupQueryAttention/output_0.out2_7"
+      ],
+      "const_args": [
+        "model.layers.7.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.7/attn/o_proj/MatMulNBits/output_0.out5_4_22"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_14",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.7/input_layernorm/output_3.out4_13",
+        "/model/layers.7/attn/o_proj/MatMulNBits/output_0.out5_4_22"
+      ],
+      "const_args": [
+        "model.layers.7.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.7/post_attention_layernorm/output_3.out4_14",
+        "/model/layers.7/post_attention_layernorm/output_0.out4_14"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_7",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.7/post_attention_layernorm/output_0.out4_14"
+      ],
+      "const_args": [
+        "model.layers.7.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.7.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.7.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.7.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.7.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.7/mlp/Mul/output_0.out3_7"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.7.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.7/mlp/Mul/output_0.out3_7"
+      ],
+      "const_args": [
+        "model.layers.7.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.7/mlp/down_proj/MatMulNBits/output_0.out5_4_23"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_15",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.7/post_attention_layernorm/output_3.out4_14",
+        "/model/layers.7/mlp/down_proj/MatMulNBits/output_0.out5_4_23"
+      ],
+      "const_args": [
+        "model.layers.8.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.8/input_layernorm/output_3.out4_15",
+        "/model/layers.8/input_layernorm/output_0.out4_15"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_8",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.8/input_layernorm/output_0.out4_15"
+      ],
+      "const_args": [
+        "model.layers.8.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.8/attn/qk_proj/MatMulNBits/output_0.out5_4_24"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.8.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.8/input_layernorm/output_0.out4_15"
+      ],
+      "const_args": [
+        "model.layers.8.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.8.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "35",
+            "17"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.8/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.8/attn/qk_proj/MatMulNBits/output_0.out5_4_24",
+        "past_key_values.8.key",
+        "past_key_values.8.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.8/attn/GroupQueryAttention/output_0.out2_8",
+        "present.8.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "32",
+            "16",
+            "2",
+            "0",
+            "33",
+            "17",
+            "6",
+            "0",
+            "34",
+            "16"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.8.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.8/attn/GroupQueryAttention/output_0.out2_8"
+      ],
+      "const_args": [
+        "model.layers.8.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.8/attn/o_proj/MatMulNBits/output_0.out5_4_25"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_16",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.8/input_layernorm/output_3.out4_15",
+        "/model/layers.8/attn/o_proj/MatMulNBits/output_0.out5_4_25"
+      ],
+      "const_args": [
+        "model.layers.8.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.8/post_attention_layernorm/output_3.out4_16",
+        "/model/layers.8/post_attention_layernorm/output_0.out4_16"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_8",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.8/post_attention_layernorm/output_0.out4_16"
+      ],
+      "const_args": [
+        "model.layers.8.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.8.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.8.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.8.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.8.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.8/mlp/Mul/output_0.out3_8"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.8.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.8/mlp/Mul/output_0.out3_8"
+      ],
+      "const_args": [
+        "model.layers.8.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.8/mlp/down_proj/MatMulNBits/output_0.out5_4_26"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_17",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.8/post_attention_layernorm/output_3.out4_16",
+        "/model/layers.8/mlp/down_proj/MatMulNBits/output_0.out5_4_26"
+      ],
+      "const_args": [
+        "model.layers.9.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.9/input_layernorm/output_3.out4_17",
+        "/model/layers.9/input_layernorm/output_0.out4_17"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_9",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.9/input_layernorm/output_0.out4_17"
+      ],
+      "const_args": [
+        "model.layers.9.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.9/attn/qk_proj/MatMulNBits/output_0.out5_4_27"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.9.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.9/input_layernorm/output_0.out4_17"
+      ],
+      "const_args": [
+        "model.layers.9.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.9.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "39",
+            "19"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.9/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.9/attn/qk_proj/MatMulNBits/output_0.out5_4_27",
+        "past_key_values.9.key",
+        "past_key_values.9.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.9/attn/GroupQueryAttention/output_0.out2_9",
+        "present.9.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "36",
+            "18",
+            "2",
+            "0",
+            "37",
+            "19",
+            "6",
+            "0",
+            "38",
+            "18"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.9.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.9/attn/GroupQueryAttention/output_0.out2_9"
+      ],
+      "const_args": [
+        "model.layers.9.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.9/attn/o_proj/MatMulNBits/output_0.out5_4_28"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_18",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.9/input_layernorm/output_3.out4_17",
+        "/model/layers.9/attn/o_proj/MatMulNBits/output_0.out5_4_28"
+      ],
+      "const_args": [
+        "model.layers.9.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.9/post_attention_layernorm/output_3.out4_18",
+        "/model/layers.9/post_attention_layernorm/output_0.out4_18"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_9",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.9/post_attention_layernorm/output_0.out4_18"
+      ],
+      "const_args": [
+        "model.layers.9.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.9.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.9.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.9.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.9.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.9/mlp/Mul/output_0.out3_9"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.9.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.9/mlp/Mul/output_0.out3_9"
+      ],
+      "const_args": [
+        "model.layers.9.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.9/mlp/down_proj/MatMulNBits/output_0.out5_4_29"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_19",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.9/post_attention_layernorm/output_3.out4_18",
+        "/model/layers.9/mlp/down_proj/MatMulNBits/output_0.out5_4_29"
+      ],
+      "const_args": [
+        "model.layers.10.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.10/input_layernorm/output_3.out4_19",
+        "/model/layers.10/input_layernorm/output_0.out4_19"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_10",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.10/input_layernorm/output_0.out4_19"
+      ],
+      "const_args": [
+        "model.layers.10.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.10/attn/qk_proj/MatMulNBits/output_0.out5_4_30"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.10.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.10/input_layernorm/output_0.out4_19"
+      ],
+      "const_args": [
+        "model.layers.10.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.10.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "43",
+            "21"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.10/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.10/attn/qk_proj/MatMulNBits/output_0.out5_4_30",
+        "past_key_values.10.key",
+        "past_key_values.10.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.10/attn/GroupQueryAttention/output_0.out2_10",
+        "present.10.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "40",
+            "20",
+            "2",
+            "0",
+            "41",
+            "21",
+            "6",
+            "0",
+            "42",
+            "20"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.10.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.10/attn/GroupQueryAttention/output_0.out2_10"
+      ],
+      "const_args": [
+        "model.layers.10.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.10/attn/o_proj/MatMulNBits/output_0.out5_4_31"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_20",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.10/input_layernorm/output_3.out4_19",
+        "/model/layers.10/attn/o_proj/MatMulNBits/output_0.out5_4_31"
+      ],
+      "const_args": [
+        "model.layers.10.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.10/post_attention_layernorm/output_3.out4_20",
+        "/model/layers.10/post_attention_layernorm/output_0.out4_20"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_10",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.10/post_attention_layernorm/output_0.out4_20"
+      ],
+      "const_args": [
+        "model.layers.10.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.10.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.10.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.10.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.10.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.10/mlp/Mul/output_0.out3_10"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.10.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.10/mlp/Mul/output_0.out3_10"
+      ],
+      "const_args": [
+        "model.layers.10.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.10/mlp/down_proj/MatMulNBits/output_0.out5_4_32"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_21",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.10/post_attention_layernorm/output_3.out4_20",
+        "/model/layers.10/mlp/down_proj/MatMulNBits/output_0.out5_4_32"
+      ],
+      "const_args": [
+        "model.layers.11.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.11/input_layernorm/output_3.out4_21",
+        "/model/layers.11/input_layernorm/output_0.out4_21"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_11",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.11/input_layernorm/output_0.out4_21"
+      ],
+      "const_args": [
+        "model.layers.11.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.11/attn/qk_proj/MatMulNBits/output_0.out5_4_33"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.11.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.11/input_layernorm/output_0.out4_21"
+      ],
+      "const_args": [
+        "model.layers.11.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.11.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "47",
+            "23"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.11/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.11/attn/qk_proj/MatMulNBits/output_0.out5_4_33",
+        "past_key_values.11.key",
+        "past_key_values.11.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.11/attn/GroupQueryAttention/output_0.out2_11",
+        "present.11.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "44",
+            "22",
+            "2",
+            "0",
+            "45",
+            "23",
+            "6",
+            "0",
+            "46",
+            "22"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.11.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.11/attn/GroupQueryAttention/output_0.out2_11"
+      ],
+      "const_args": [
+        "model.layers.11.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.11/attn/o_proj/MatMulNBits/output_0.out5_4_34"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_22",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.11/input_layernorm/output_3.out4_21",
+        "/model/layers.11/attn/o_proj/MatMulNBits/output_0.out5_4_34"
+      ],
+      "const_args": [
+        "model.layers.11.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.11/post_attention_layernorm/output_3.out4_22",
+        "/model/layers.11/post_attention_layernorm/output_0.out4_22"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_11",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.11/post_attention_layernorm/output_0.out4_22"
+      ],
+      "const_args": [
+        "model.layers.11.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.11.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.11.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.11.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.11.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.11/mlp/Mul/output_0.out3_11"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.11.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.11/mlp/Mul/output_0.out3_11"
+      ],
+      "const_args": [
+        "model.layers.11.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.11/mlp/down_proj/MatMulNBits/output_0.out5_4_35"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_23",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.11/post_attention_layernorm/output_3.out4_22",
+        "/model/layers.11/mlp/down_proj/MatMulNBits/output_0.out5_4_35"
+      ],
+      "const_args": [
+        "model.layers.12.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.12/input_layernorm/output_3.out4_23",
+        "/model/layers.12/input_layernorm/output_0.out4_23"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_12",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.12/input_layernorm/output_0.out4_23"
+      ],
+      "const_args": [
+        "model.layers.12.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.12/attn/qk_proj/MatMulNBits/output_0.out5_4_36"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.12.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.12/input_layernorm/output_0.out4_23"
+      ],
+      "const_args": [
+        "model.layers.12.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.12.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "51",
+            "25"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.12/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.12/attn/qk_proj/MatMulNBits/output_0.out5_4_36",
+        "past_key_values.12.key",
+        "past_key_values.12.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.12/attn/GroupQueryAttention/output_0.out2_12",
+        "present.12.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "48",
+            "24",
+            "2",
+            "0",
+            "49",
+            "25",
+            "6",
+            "0",
+            "50",
+            "24"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.12.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.12/attn/GroupQueryAttention/output_0.out2_12"
+      ],
+      "const_args": [
+        "model.layers.12.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.12/attn/o_proj/MatMulNBits/output_0.out5_4_37"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_24",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.12/input_layernorm/output_3.out4_23",
+        "/model/layers.12/attn/o_proj/MatMulNBits/output_0.out5_4_37"
+      ],
+      "const_args": [
+        "model.layers.12.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.12/post_attention_layernorm/output_3.out4_24",
+        "/model/layers.12/post_attention_layernorm/output_0.out4_24"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_12",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.12/post_attention_layernorm/output_0.out4_24"
+      ],
+      "const_args": [
+        "model.layers.12.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.12.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.12.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.12.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.12.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.12/mlp/Mul/output_0.out3_12"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.12.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.12/mlp/Mul/output_0.out3_12"
+      ],
+      "const_args": [
+        "model.layers.12.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.12/mlp/down_proj/MatMulNBits/output_0.out5_4_38"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_25",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.12/post_attention_layernorm/output_3.out4_24",
+        "/model/layers.12/mlp/down_proj/MatMulNBits/output_0.out5_4_38"
+      ],
+      "const_args": [
+        "model.layers.13.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.13/input_layernorm/output_3.out4_25",
+        "/model/layers.13/input_layernorm/output_0.out4_25"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_13",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.13/input_layernorm/output_0.out4_25"
+      ],
+      "const_args": [
+        "model.layers.13.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.13/attn/qk_proj/MatMulNBits/output_0.out5_4_39"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.13.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.13/input_layernorm/output_0.out4_25"
+      ],
+      "const_args": [
+        "model.layers.13.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.13.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "55",
+            "27"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.13/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.13/attn/qk_proj/MatMulNBits/output_0.out5_4_39",
+        "past_key_values.13.key",
+        "past_key_values.13.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.13/attn/GroupQueryAttention/output_0.out2_13",
+        "present.13.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "52",
+            "26",
+            "2",
+            "0",
+            "53",
+            "27",
+            "6",
+            "0",
+            "54",
+            "26"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.13.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.13/attn/GroupQueryAttention/output_0.out2_13"
+      ],
+      "const_args": [
+        "model.layers.13.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.13/attn/o_proj/MatMulNBits/output_0.out5_4_40"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_26",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.13/input_layernorm/output_3.out4_25",
+        "/model/layers.13/attn/o_proj/MatMulNBits/output_0.out5_4_40"
+      ],
+      "const_args": [
+        "model.layers.13.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.13/post_attention_layernorm/output_3.out4_26",
+        "/model/layers.13/post_attention_layernorm/output_0.out4_26"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_13",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.13/post_attention_layernorm/output_0.out4_26"
+      ],
+      "const_args": [
+        "model.layers.13.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.13.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.13.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.13.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.13.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.13/mlp/Mul/output_0.out3_13"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.13.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.13/mlp/Mul/output_0.out3_13"
+      ],
+      "const_args": [
+        "model.layers.13.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.13/mlp/down_proj/MatMulNBits/output_0.out5_4_41"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_27",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.13/post_attention_layernorm/output_3.out4_26",
+        "/model/layers.13/mlp/down_proj/MatMulNBits/output_0.out5_4_41"
+      ],
+      "const_args": [
+        "model.layers.14.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.14/input_layernorm/output_3.out4_27",
+        "/model/layers.14/input_layernorm/output_0.out4_27"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_14",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.14/input_layernorm/output_0.out4_27"
+      ],
+      "const_args": [
+        "model.layers.14.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.14/attn/qk_proj/MatMulNBits/output_0.out5_4_42"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.14.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.14/input_layernorm/output_0.out4_27"
+      ],
+      "const_args": [
+        "model.layers.14.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.14.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "59",
+            "29"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.14/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.14/attn/qk_proj/MatMulNBits/output_0.out5_4_42",
+        "past_key_values.14.key",
+        "past_key_values.14.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.14/attn/GroupQueryAttention/output_0.out2_14",
+        "present.14.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "56",
+            "28",
+            "2",
+            "0",
+            "57",
+            "29",
+            "6",
+            "0",
+            "58",
+            "28"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.14.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.14/attn/GroupQueryAttention/output_0.out2_14"
+      ],
+      "const_args": [
+        "model.layers.14.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.14/attn/o_proj/MatMulNBits/output_0.out5_4_43"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_28",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.14/input_layernorm/output_3.out4_27",
+        "/model/layers.14/attn/o_proj/MatMulNBits/output_0.out5_4_43"
+      ],
+      "const_args": [
+        "model.layers.14.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.14/post_attention_layernorm/output_3.out4_28",
+        "/model/layers.14/post_attention_layernorm/output_0.out4_28"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_14",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.14/post_attention_layernorm/output_0.out4_28"
+      ],
+      "const_args": [
+        "model.layers.14.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.14.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.14.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.14.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.14.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.14/mlp/Mul/output_0.out3_14"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.14.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.14/mlp/Mul/output_0.out3_14"
+      ],
+      "const_args": [
+        "model.layers.14.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.14/mlp/down_proj/MatMulNBits/output_0.out5_4_44"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_29",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.14/post_attention_layernorm/output_3.out4_28",
+        "/model/layers.14/mlp/down_proj/MatMulNBits/output_0.out5_4_44"
+      ],
+      "const_args": [
+        "model.layers.15.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.15/input_layernorm/output_3.out4_29",
+        "/model/layers.15/input_layernorm/output_0.out4_29"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_15",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.15/input_layernorm/output_0.out4_29"
+      ],
+      "const_args": [
+        "model.layers.15.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.15/attn/qk_proj/MatMulNBits/output_0.out5_4_45"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.15.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.15/input_layernorm/output_0.out4_29"
+      ],
+      "const_args": [
+        "model.layers.15.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.15.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "63",
+            "31"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.15/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.15/attn/qk_proj/MatMulNBits/output_0.out5_4_45",
+        "past_key_values.15.key",
+        "past_key_values.15.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.15/attn/GroupQueryAttention/output_0.out2_15",
+        "present.15.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "60",
+            "30",
+            "2",
+            "0",
+            "61",
+            "31",
+            "6",
+            "0",
+            "62",
+            "30"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.15.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.15/attn/GroupQueryAttention/output_0.out2_15"
+      ],
+      "const_args": [
+        "model.layers.15.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.15/attn/o_proj/MatMulNBits/output_0.out5_4_46"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_30",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.15/input_layernorm/output_3.out4_29",
+        "/model/layers.15/attn/o_proj/MatMulNBits/output_0.out5_4_46"
+      ],
+      "const_args": [
+        "model.layers.15.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.15/post_attention_layernorm/output_3.out4_30",
+        "/model/layers.15/post_attention_layernorm/output_0.out4_30"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_15",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.15/post_attention_layernorm/output_0.out4_30"
+      ],
+      "const_args": [
+        "model.layers.15.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.15.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.15.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.15.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.15.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.15/mlp/Mul/output_0.out3_15"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.15.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.15/mlp/Mul/output_0.out3_15"
+      ],
+      "const_args": [
+        "model.layers.15.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.15/mlp/down_proj/MatMulNBits/output_0.out5_4_47"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_31",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.15/post_attention_layernorm/output_3.out4_30",
+        "/model/layers.15/mlp/down_proj/MatMulNBits/output_0.out5_4_47"
+      ],
+      "const_args": [
+        "model.layers.16.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.16/input_layernorm/output_3.out4_31",
+        "/model/layers.16/input_layernorm/output_0.out4_31"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_16",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.16/input_layernorm/output_0.out4_31"
+      ],
+      "const_args": [
+        "model.layers.16.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.16/attn/qk_proj/MatMulNBits/output_0.out5_4_48"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.16.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.16/input_layernorm/output_0.out4_31"
+      ],
+      "const_args": [
+        "model.layers.16.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.16.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "67",
+            "33"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.16/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.16/attn/qk_proj/MatMulNBits/output_0.out5_4_48",
+        "past_key_values.16.key",
+        "past_key_values.16.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.16/attn/GroupQueryAttention/output_0.out2_16",
+        "present.16.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "64",
+            "32",
+            "2",
+            "0",
+            "65",
+            "33",
+            "6",
+            "0",
+            "66",
+            "32"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.16.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.16/attn/GroupQueryAttention/output_0.out2_16"
+      ],
+      "const_args": [
+        "model.layers.16.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.16/attn/o_proj/MatMulNBits/output_0.out5_4_49"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_32",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.16/input_layernorm/output_3.out4_31",
+        "/model/layers.16/attn/o_proj/MatMulNBits/output_0.out5_4_49"
+      ],
+      "const_args": [
+        "model.layers.16.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.16/post_attention_layernorm/output_3.out4_32",
+        "/model/layers.16/post_attention_layernorm/output_0.out4_32"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_16",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.16/post_attention_layernorm/output_0.out4_32"
+      ],
+      "const_args": [
+        "model.layers.16.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.16.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.16.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.16.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.16.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.16/mlp/Mul/output_0.out3_16"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.16.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.16/mlp/Mul/output_0.out3_16"
+      ],
+      "const_args": [
+        "model.layers.16.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.16/mlp/down_proj/MatMulNBits/output_0.out5_4_50"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_33",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.16/post_attention_layernorm/output_3.out4_32",
+        "/model/layers.16/mlp/down_proj/MatMulNBits/output_0.out5_4_50"
+      ],
+      "const_args": [
+        "model.layers.17.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.17/input_layernorm/output_3.out4_33",
+        "/model/layers.17/input_layernorm/output_0.out4_33"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_17",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.17/input_layernorm/output_0.out4_33"
+      ],
+      "const_args": [
+        "model.layers.17.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.17/attn/qk_proj/MatMulNBits/output_0.out5_4_51"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.17.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.17/input_layernorm/output_0.out4_33"
+      ],
+      "const_args": [
+        "model.layers.17.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.17.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "71",
+            "35"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.17/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.17/attn/qk_proj/MatMulNBits/output_0.out5_4_51",
+        "past_key_values.17.key",
+        "past_key_values.17.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.17/attn/GroupQueryAttention/output_0.out2_17",
+        "present.17.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "68",
+            "34",
+            "2",
+            "0",
+            "69",
+            "35",
+            "6",
+            "0",
+            "70",
+            "34"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.17.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.17/attn/GroupQueryAttention/output_0.out2_17"
+      ],
+      "const_args": [
+        "model.layers.17.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.17/attn/o_proj/MatMulNBits/output_0.out5_4_52"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_34",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.17/input_layernorm/output_3.out4_33",
+        "/model/layers.17/attn/o_proj/MatMulNBits/output_0.out5_4_52"
+      ],
+      "const_args": [
+        "model.layers.17.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.17/post_attention_layernorm/output_3.out4_34",
+        "/model/layers.17/post_attention_layernorm/output_0.out4_34"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_17",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.17/post_attention_layernorm/output_0.out4_34"
+      ],
+      "const_args": [
+        "model.layers.17.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.17.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.17.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.17.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.17.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.17/mlp/Mul/output_0.out3_17"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.17.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.17/mlp/Mul/output_0.out3_17"
+      ],
+      "const_args": [
+        "model.layers.17.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.17/mlp/down_proj/MatMulNBits/output_0.out5_4_53"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_35",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.17/post_attention_layernorm/output_3.out4_34",
+        "/model/layers.17/mlp/down_proj/MatMulNBits/output_0.out5_4_53"
+      ],
+      "const_args": [
+        "model.layers.18.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.18/input_layernorm/output_3.out4_35",
+        "/model/layers.18/input_layernorm/output_0.out4_35"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_18",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.18/input_layernorm/output_0.out4_35"
+      ],
+      "const_args": [
+        "model.layers.18.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.18/attn/qk_proj/MatMulNBits/output_0.out5_4_54"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.18.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.18/input_layernorm/output_0.out4_35"
+      ],
+      "const_args": [
+        "model.layers.18.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.18.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "75",
+            "37"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.18/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.18/attn/qk_proj/MatMulNBits/output_0.out5_4_54",
+        "past_key_values.18.key",
+        "past_key_values.18.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.18/attn/GroupQueryAttention/output_0.out2_18",
+        "present.18.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "72",
+            "36",
+            "2",
+            "0",
+            "73",
+            "37",
+            "6",
+            "0",
+            "74",
+            "36"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.18.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.18/attn/GroupQueryAttention/output_0.out2_18"
+      ],
+      "const_args": [
+        "model.layers.18.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.18/attn/o_proj/MatMulNBits/output_0.out5_4_55"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_36",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.18/input_layernorm/output_3.out4_35",
+        "/model/layers.18/attn/o_proj/MatMulNBits/output_0.out5_4_55"
+      ],
+      "const_args": [
+        "model.layers.18.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.18/post_attention_layernorm/output_3.out4_36",
+        "/model/layers.18/post_attention_layernorm/output_0.out4_36"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_18",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.18/post_attention_layernorm/output_0.out4_36"
+      ],
+      "const_args": [
+        "model.layers.18.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.18.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.18.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.18.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.18.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.18/mlp/Mul/output_0.out3_18"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.18.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.18/mlp/Mul/output_0.out3_18"
+      ],
+      "const_args": [
+        "model.layers.18.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.18/mlp/down_proj/MatMulNBits/output_0.out5_4_56"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_37",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.18/post_attention_layernorm/output_3.out4_36",
+        "/model/layers.18/mlp/down_proj/MatMulNBits/output_0.out5_4_56"
+      ],
+      "const_args": [
+        "model.layers.19.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.19/input_layernorm/output_3.out4_37",
+        "/model/layers.19/input_layernorm/output_0.out4_37"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_19",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.19/input_layernorm/output_0.out4_37"
+      ],
+      "const_args": [
+        "model.layers.19.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.19/attn/qk_proj/MatMulNBits/output_0.out5_4_57"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.19.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.19/input_layernorm/output_0.out4_37"
+      ],
+      "const_args": [
+        "model.layers.19.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.19.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "79",
+            "39"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.19/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.19/attn/qk_proj/MatMulNBits/output_0.out5_4_57",
+        "past_key_values.19.key",
+        "past_key_values.19.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.19/attn/GroupQueryAttention/output_0.out2_19",
+        "present.19.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "76",
+            "38",
+            "2",
+            "0",
+            "77",
+            "39",
+            "6",
+            "0",
+            "78",
+            "38"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.19.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.19/attn/GroupQueryAttention/output_0.out2_19"
+      ],
+      "const_args": [
+        "model.layers.19.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.19/attn/o_proj/MatMulNBits/output_0.out5_4_58"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_38",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.19/input_layernorm/output_3.out4_37",
+        "/model/layers.19/attn/o_proj/MatMulNBits/output_0.out5_4_58"
+      ],
+      "const_args": [
+        "model.layers.19.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.19/post_attention_layernorm/output_3.out4_38",
+        "/model/layers.19/post_attention_layernorm/output_0.out4_38"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_19",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.19/post_attention_layernorm/output_0.out4_38"
+      ],
+      "const_args": [
+        "model.layers.19.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.19.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.19.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.19.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.19.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.19/mlp/Mul/output_0.out3_19"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.19.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.19/mlp/Mul/output_0.out3_19"
+      ],
+      "const_args": [
+        "model.layers.19.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.19/mlp/down_proj/MatMulNBits/output_0.out5_4_59"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_39",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.19/post_attention_layernorm/output_3.out4_38",
+        "/model/layers.19/mlp/down_proj/MatMulNBits/output_0.out5_4_59"
+      ],
+      "const_args": [
+        "model.layers.20.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.20/input_layernorm/output_3.out4_39",
+        "/model/layers.20/input_layernorm/output_0.out4_39"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_20",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.20/input_layernorm/output_0.out4_39"
+      ],
+      "const_args": [
+        "model.layers.20.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.20/attn/qk_proj/MatMulNBits/output_0.out5_4_60"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.20.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.20/input_layernorm/output_0.out4_39"
+      ],
+      "const_args": [
+        "model.layers.20.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.20.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "83",
+            "41"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.20/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.20/attn/qk_proj/MatMulNBits/output_0.out5_4_60",
+        "past_key_values.20.key",
+        "past_key_values.20.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.20/attn/GroupQueryAttention/output_0.out2_20",
+        "present.20.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "80",
+            "40",
+            "2",
+            "0",
+            "81",
+            "41",
+            "6",
+            "0",
+            "82",
+            "40"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.20.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.20/attn/GroupQueryAttention/output_0.out2_20"
+      ],
+      "const_args": [
+        "model.layers.20.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.20/attn/o_proj/MatMulNBits/output_0.out5_4_61"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_40",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.20/input_layernorm/output_3.out4_39",
+        "/model/layers.20/attn/o_proj/MatMulNBits/output_0.out5_4_61"
+      ],
+      "const_args": [
+        "model.layers.20.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.20/post_attention_layernorm/output_3.out4_40",
+        "/model/layers.20/post_attention_layernorm/output_0.out4_40"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_20",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.20/post_attention_layernorm/output_0.out4_40"
+      ],
+      "const_args": [
+        "model.layers.20.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.20.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.20.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.20.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.20.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.20/mlp/Mul/output_0.out3_20"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.20.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.20/mlp/Mul/output_0.out3_20"
+      ],
+      "const_args": [
+        "model.layers.20.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.20/mlp/down_proj/MatMulNBits/output_0.out5_4_62"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_41",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.20/post_attention_layernorm/output_3.out4_40",
+        "/model/layers.20/mlp/down_proj/MatMulNBits/output_0.out5_4_62"
+      ],
+      "const_args": [
+        "model.layers.21.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.21/input_layernorm/output_3.out4_41",
+        "/model/layers.21/input_layernorm/output_0.out4_41"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_21",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.21/input_layernorm/output_0.out4_41"
+      ],
+      "const_args": [
+        "model.layers.21.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.21/attn/qk_proj/MatMulNBits/output_0.out5_4_63"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.21.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.21/input_layernorm/output_0.out4_41"
+      ],
+      "const_args": [
+        "model.layers.21.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.21.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "87",
+            "43"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.21/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.21/attn/qk_proj/MatMulNBits/output_0.out5_4_63",
+        "past_key_values.21.key",
+        "past_key_values.21.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.21/attn/GroupQueryAttention/output_0.out2_21",
+        "present.21.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "84",
+            "42",
+            "2",
+            "0",
+            "85",
+            "43",
+            "6",
+            "0",
+            "86",
+            "42"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.21.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.21/attn/GroupQueryAttention/output_0.out2_21"
+      ],
+      "const_args": [
+        "model.layers.21.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.21/attn/o_proj/MatMulNBits/output_0.out5_4_64"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_42",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.21/input_layernorm/output_3.out4_41",
+        "/model/layers.21/attn/o_proj/MatMulNBits/output_0.out5_4_64"
+      ],
+      "const_args": [
+        "model.layers.21.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.21/post_attention_layernorm/output_3.out4_42",
+        "/model/layers.21/post_attention_layernorm/output_0.out4_42"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_21",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.21/post_attention_layernorm/output_0.out4_42"
+      ],
+      "const_args": [
+        "model.layers.21.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.21.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.21.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.21.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.21.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.21/mlp/Mul/output_0.out3_21"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.21.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.21/mlp/Mul/output_0.out3_21"
+      ],
+      "const_args": [
+        "model.layers.21.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.21/mlp/down_proj/MatMulNBits/output_0.out5_4_65"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_43",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.21/post_attention_layernorm/output_3.out4_42",
+        "/model/layers.21/mlp/down_proj/MatMulNBits/output_0.out5_4_65"
+      ],
+      "const_args": [
+        "model.layers.22.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.22/input_layernorm/output_3.out4_43",
+        "/model/layers.22/input_layernorm/output_0.out4_43"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_22",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.22/input_layernorm/output_0.out4_43"
+      ],
+      "const_args": [
+        "model.layers.22.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.22/attn/qk_proj/MatMulNBits/output_0.out5_4_66"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.22.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.22/input_layernorm/output_0.out4_43"
+      ],
+      "const_args": [
+        "model.layers.22.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.22.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "91",
+            "45"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.22/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.22/attn/qk_proj/MatMulNBits/output_0.out5_4_66",
+        "past_key_values.22.key",
+        "past_key_values.22.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.22/attn/GroupQueryAttention/output_0.out2_22",
+        "present.22.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "88",
+            "44",
+            "2",
+            "0",
+            "89",
+            "45",
+            "6",
+            "0",
+            "90",
+            "44"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.22.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.22/attn/GroupQueryAttention/output_0.out2_22"
+      ],
+      "const_args": [
+        "model.layers.22.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.22/attn/o_proj/MatMulNBits/output_0.out5_4_67"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_44",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.22/input_layernorm/output_3.out4_43",
+        "/model/layers.22/attn/o_proj/MatMulNBits/output_0.out5_4_67"
+      ],
+      "const_args": [
+        "model.layers.22.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.22/post_attention_layernorm/output_3.out4_44",
+        "/model/layers.22/post_attention_layernorm/output_0.out4_44"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_22",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.22/post_attention_layernorm/output_0.out4_44"
+      ],
+      "const_args": [
+        "model.layers.22.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.22.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.22.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.22.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.22.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.22/mlp/Mul/output_0.out3_22"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.22.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.22/mlp/Mul/output_0.out3_22"
+      ],
+      "const_args": [
+        "model.layers.22.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.22/mlp/down_proj/MatMulNBits/output_0.out5_4_68"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_45",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.22/post_attention_layernorm/output_3.out4_44",
+        "/model/layers.22/mlp/down_proj/MatMulNBits/output_0.out5_4_68"
+      ],
+      "const_args": [
+        "model.layers.23.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.23/input_layernorm/output_3.out4_45",
+        "/model/layers.23/input_layernorm/output_0.out4_45"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_23",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.23/input_layernorm/output_0.out4_45"
+      ],
+      "const_args": [
+        "model.layers.23.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.23/attn/qk_proj/MatMulNBits/output_0.out5_4_69"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.23.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.23/input_layernorm/output_0.out4_45"
+      ],
+      "const_args": [
+        "model.layers.23.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.23.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "95",
+            "47"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.23/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.23/attn/qk_proj/MatMulNBits/output_0.out5_4_69",
+        "past_key_values.23.key",
+        "past_key_values.23.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.23/attn/GroupQueryAttention/output_0.out2_23",
+        "present.23.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "92",
+            "46",
+            "2",
+            "0",
+            "93",
+            "47",
+            "6",
+            "0",
+            "94",
+            "46"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.23.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.23/attn/GroupQueryAttention/output_0.out2_23"
+      ],
+      "const_args": [
+        "model.layers.23.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.23/attn/o_proj/MatMulNBits/output_0.out5_4_70"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_46",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.23/input_layernorm/output_3.out4_45",
+        "/model/layers.23/attn/o_proj/MatMulNBits/output_0.out5_4_70"
+      ],
+      "const_args": [
+        "model.layers.23.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.23/post_attention_layernorm/output_3.out4_46",
+        "/model/layers.23/post_attention_layernorm/output_0.out4_46"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_23",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.23/post_attention_layernorm/output_0.out4_46"
+      ],
+      "const_args": [
+        "model.layers.23.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.23.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.23.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.23.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.23.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.23/mlp/Mul/output_0.out3_23"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.23.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.23/mlp/Mul/output_0.out3_23"
+      ],
+      "const_args": [
+        "model.layers.23.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.23/mlp/down_proj/MatMulNBits/output_0.out5_4_71"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_47",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.23/post_attention_layernorm/output_3.out4_46",
+        "/model/layers.23/mlp/down_proj/MatMulNBits/output_0.out5_4_71"
+      ],
+      "const_args": [
+        "model.layers.24.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.24/input_layernorm/output_3.out4_47",
+        "/model/layers.24/input_layernorm/output_0.out4_47"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_24",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.24/input_layernorm/output_0.out4_47"
+      ],
+      "const_args": [
+        "model.layers.24.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.24/attn/qk_proj/MatMulNBits/output_0.out5_4_72"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.24.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.24/input_layernorm/output_0.out4_47"
+      ],
+      "const_args": [
+        "model.layers.24.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.24.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "99",
+            "49"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.24/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.24/attn/qk_proj/MatMulNBits/output_0.out5_4_72",
+        "past_key_values.24.key",
+        "past_key_values.24.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.24/attn/GroupQueryAttention/output_0.out2_24",
+        "present.24.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "96",
+            "48",
+            "2",
+            "0",
+            "97",
+            "49",
+            "6",
+            "0",
+            "98",
+            "48"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.24.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.24/attn/GroupQueryAttention/output_0.out2_24"
+      ],
+      "const_args": [
+        "model.layers.24.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.24/attn/o_proj/MatMulNBits/output_0.out5_4_73"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_48",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.24/input_layernorm/output_3.out4_47",
+        "/model/layers.24/attn/o_proj/MatMulNBits/output_0.out5_4_73"
+      ],
+      "const_args": [
+        "model.layers.24.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.24/post_attention_layernorm/output_3.out4_48",
+        "/model/layers.24/post_attention_layernorm/output_0.out4_48"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_24",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.24/post_attention_layernorm/output_0.out4_48"
+      ],
+      "const_args": [
+        "model.layers.24.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.24.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.24.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.24.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.24.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.24/mlp/Mul/output_0.out3_24"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.24.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.24/mlp/Mul/output_0.out3_24"
+      ],
+      "const_args": [
+        "model.layers.24.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.24/mlp/down_proj/MatMulNBits/output_0.out5_4_74"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_49",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.24/post_attention_layernorm/output_3.out4_48",
+        "/model/layers.24/mlp/down_proj/MatMulNBits/output_0.out5_4_74"
+      ],
+      "const_args": [
+        "model.layers.25.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.25/input_layernorm/output_3.out4_49",
+        "/model/layers.25/input_layernorm/output_0.out4_49"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_25",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.25/input_layernorm/output_0.out4_49"
+      ],
+      "const_args": [
+        "model.layers.25.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.25/attn/qk_proj/MatMulNBits/output_0.out5_4_75"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.25.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.25/input_layernorm/output_0.out4_49"
+      ],
+      "const_args": [
+        "model.layers.25.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.25.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "103",
+            "51"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.25/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.25/attn/qk_proj/MatMulNBits/output_0.out5_4_75",
+        "past_key_values.25.key",
+        "past_key_values.25.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.25/attn/GroupQueryAttention/output_0.out2_25",
+        "present.25.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "100",
+            "50",
+            "2",
+            "0",
+            "101",
+            "51",
+            "6",
+            "0",
+            "102",
+            "50"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.25.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.25/attn/GroupQueryAttention/output_0.out2_25"
+      ],
+      "const_args": [
+        "model.layers.25.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.25/attn/o_proj/MatMulNBits/output_0.out5_4_76"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_50",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.25/input_layernorm/output_3.out4_49",
+        "/model/layers.25/attn/o_proj/MatMulNBits/output_0.out5_4_76"
+      ],
+      "const_args": [
+        "model.layers.25.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.25/post_attention_layernorm/output_3.out4_50",
+        "/model/layers.25/post_attention_layernorm/output_0.out4_50"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_25",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.25/post_attention_layernorm/output_0.out4_50"
+      ],
+      "const_args": [
+        "model.layers.25.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.25.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.25.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.25.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.25.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.25/mlp/Mul/output_0.out3_25"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.25.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.25/mlp/Mul/output_0.out3_25"
+      ],
+      "const_args": [
+        "model.layers.25.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.25/mlp/down_proj/MatMulNBits/output_0.out5_4_77"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_51",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.25/post_attention_layernorm/output_3.out4_50",
+        "/model/layers.25/mlp/down_proj/MatMulNBits/output_0.out5_4_77"
+      ],
+      "const_args": [
+        "model.layers.26.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.26/input_layernorm/output_3.out4_51",
+        "/model/layers.26/input_layernorm/output_0.out4_51"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_26",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.26/input_layernorm/output_0.out4_51"
+      ],
+      "const_args": [
+        "model.layers.26.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.26/attn/qk_proj/MatMulNBits/output_0.out5_4_78"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.26.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.26/input_layernorm/output_0.out4_51"
+      ],
+      "const_args": [
+        "model.layers.26.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.26.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "107",
+            "53"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.26/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.26/attn/qk_proj/MatMulNBits/output_0.out5_4_78",
+        "past_key_values.26.key",
+        "past_key_values.26.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.26/attn/GroupQueryAttention/output_0.out2_26",
+        "present.26.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "104",
+            "52",
+            "2",
+            "0",
+            "105",
+            "53",
+            "6",
+            "0",
+            "106",
+            "52"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.26.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.26/attn/GroupQueryAttention/output_0.out2_26"
+      ],
+      "const_args": [
+        "model.layers.26.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.26/attn/o_proj/MatMulNBits/output_0.out5_4_79"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_52",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.26/input_layernorm/output_3.out4_51",
+        "/model/layers.26/attn/o_proj/MatMulNBits/output_0.out5_4_79"
+      ],
+      "const_args": [
+        "model.layers.26.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.26/post_attention_layernorm/output_3.out4_52",
+        "/model/layers.26/post_attention_layernorm/output_0.out4_52"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_26",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.26/post_attention_layernorm/output_0.out4_52"
+      ],
+      "const_args": [
+        "model.layers.26.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.26.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.26.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.26.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.26.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.26/mlp/Mul/output_0.out3_26"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.26.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.26/mlp/Mul/output_0.out3_26"
+      ],
+      "const_args": [
+        "model.layers.26.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.26/mlp/down_proj/MatMulNBits/output_0.out5_4_80"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_53",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.26/post_attention_layernorm/output_3.out4_52",
+        "/model/layers.26/mlp/down_proj/MatMulNBits/output_0.out5_4_80"
+      ],
+      "const_args": [
+        "model.layers.27.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.27/input_layernorm/output_3.out4_53",
+        "/model/layers.27/input_layernorm/output_0.out4_53"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_27",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.27/input_layernorm/output_0.out4_53"
+      ],
+      "const_args": [
+        "model.layers.27.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.27/attn/qk_proj/MatMulNBits/output_0.out5_4_81"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.27.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.27/input_layernorm/output_0.out4_53"
+      ],
+      "const_args": [
+        "model.layers.27.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.27.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "111",
+            "55"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.27/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.27/attn/qk_proj/MatMulNBits/output_0.out5_4_81",
+        "past_key_values.27.key",
+        "past_key_values.27.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.27/attn/GroupQueryAttention/output_0.out2_27",
+        "present.27.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "108",
+            "54",
+            "2",
+            "0",
+            "109",
+            "55",
+            "6",
+            "0",
+            "110",
+            "54"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.27.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.27/attn/GroupQueryAttention/output_0.out2_27"
+      ],
+      "const_args": [
+        "model.layers.27.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.27/attn/o_proj/MatMulNBits/output_0.out5_4_82"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_54",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.27/input_layernorm/output_3.out4_53",
+        "/model/layers.27/attn/o_proj/MatMulNBits/output_0.out5_4_82"
+      ],
+      "const_args": [
+        "model.layers.27.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.27/post_attention_layernorm/output_3.out4_54",
+        "/model/layers.27/post_attention_layernorm/output_0.out4_54"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_27",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.27/post_attention_layernorm/output_0.out4_54"
+      ],
+      "const_args": [
+        "model.layers.27.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.27.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.27.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.27.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.27.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.27/mlp/Mul/output_0.out3_27"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.27.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.27/mlp/Mul/output_0.out3_27"
+      ],
+      "const_args": [
+        "model.layers.27.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.27/mlp/down_proj/MatMulNBits/output_0.out5_4_83"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_55",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.27/post_attention_layernorm/output_3.out4_54",
+        "/model/layers.27/mlp/down_proj/MatMulNBits/output_0.out5_4_83"
+      ],
+      "const_args": [
+        "model.layers.28.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.28/input_layernorm/output_3.out4_55",
+        "/model/layers.28/input_layernorm/output_0.out4_55"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_28",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.28/input_layernorm/output_0.out4_55"
+      ],
+      "const_args": [
+        "model.layers.28.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.28/attn/qk_proj/MatMulNBits/output_0.out5_4_84"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.28.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.28/input_layernorm/output_0.out4_55"
+      ],
+      "const_args": [
+        "model.layers.28.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.28.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "115",
+            "57"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.28/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.28/attn/qk_proj/MatMulNBits/output_0.out5_4_84",
+        "past_key_values.28.key",
+        "past_key_values.28.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.28/attn/GroupQueryAttention/output_0.out2_28",
+        "present.28.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "112",
+            "56",
+            "2",
+            "0",
+            "113",
+            "57",
+            "6",
+            "0",
+            "114",
+            "56"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.28.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.28/attn/GroupQueryAttention/output_0.out2_28"
+      ],
+      "const_args": [
+        "model.layers.28.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.28/attn/o_proj/MatMulNBits/output_0.out5_4_85"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_56",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.28/input_layernorm/output_3.out4_55",
+        "/model/layers.28/attn/o_proj/MatMulNBits/output_0.out5_4_85"
+      ],
+      "const_args": [
+        "model.layers.28.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.28/post_attention_layernorm/output_3.out4_56",
+        "/model/layers.28/post_attention_layernorm/output_0.out4_56"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_28",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.28/post_attention_layernorm/output_0.out4_56"
+      ],
+      "const_args": [
+        "model.layers.28.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.28.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.28.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.28.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.28.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.28/mlp/Mul/output_0.out3_28"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.28.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.28/mlp/Mul/output_0.out3_28"
+      ],
+      "const_args": [
+        "model.layers.28.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.28/mlp/down_proj/MatMulNBits/output_0.out5_4_86"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_57",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.28/post_attention_layernorm/output_3.out4_56",
+        "/model/layers.28/mlp/down_proj/MatMulNBits/output_0.out5_4_86"
+      ],
+      "const_args": [
+        "model.layers.29.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.29/input_layernorm/output_3.out4_57",
+        "/model/layers.29/input_layernorm/output_0.out4_57"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_29",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.29/input_layernorm/output_0.out4_57"
+      ],
+      "const_args": [
+        "model.layers.29.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.29/attn/qk_proj/MatMulNBits/output_0.out5_4_87"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.29.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.29/input_layernorm/output_0.out4_57"
+      ],
+      "const_args": [
+        "model.layers.29.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.29.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "119",
+            "59"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.29/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.29/attn/qk_proj/MatMulNBits/output_0.out5_4_87",
+        "past_key_values.29.key",
+        "past_key_values.29.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.29/attn/GroupQueryAttention/output_0.out2_29",
+        "present.29.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "116",
+            "58",
+            "2",
+            "0",
+            "117",
+            "59",
+            "6",
+            "0",
+            "118",
+            "58"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.29.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.29/attn/GroupQueryAttention/output_0.out2_29"
+      ],
+      "const_args": [
+        "model.layers.29.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.29/attn/o_proj/MatMulNBits/output_0.out5_4_88"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_58",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.29/input_layernorm/output_3.out4_57",
+        "/model/layers.29/attn/o_proj/MatMulNBits/output_0.out5_4_88"
+      ],
+      "const_args": [
+        "model.layers.29.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.29/post_attention_layernorm/output_3.out4_58",
+        "/model/layers.29/post_attention_layernorm/output_0.out4_58"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_29",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.29/post_attention_layernorm/output_0.out4_58"
+      ],
+      "const_args": [
+        "model.layers.29.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.29.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.29.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.29.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.29.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.29/mlp/Mul/output_0.out3_29"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.29.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.29/mlp/Mul/output_0.out3_29"
+      ],
+      "const_args": [
+        "model.layers.29.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.29/mlp/down_proj/MatMulNBits/output_0.out5_4_89"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_59",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.29/post_attention_layernorm/output_3.out4_58",
+        "/model/layers.29/mlp/down_proj/MatMulNBits/output_0.out5_4_89"
+      ],
+      "const_args": [
+        "model.layers.30.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.30/input_layernorm/output_3.out4_59",
+        "/model/layers.30/input_layernorm/output_0.out4_59"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_30",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.30/input_layernorm/output_0.out4_59"
+      ],
+      "const_args": [
+        "model.layers.30.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.30/attn/qk_proj/MatMulNBits/output_0.out5_4_90"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.30.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.30/input_layernorm/output_0.out4_59"
+      ],
+      "const_args": [
+        "model.layers.30.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.30.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "123",
+            "61"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.30/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.30/attn/qk_proj/MatMulNBits/output_0.out5_4_90",
+        "past_key_values.30.key",
+        "past_key_values.30.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.30/attn/GroupQueryAttention/output_0.out2_30",
+        "present.30.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "120",
+            "60",
+            "2",
+            "0",
+            "121",
+            "61",
+            "6",
+            "0",
+            "122",
+            "60"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.30.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.30/attn/GroupQueryAttention/output_0.out2_30"
+      ],
+      "const_args": [
+        "model.layers.30.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.30/attn/o_proj/MatMulNBits/output_0.out5_4_91"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_60",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.30/input_layernorm/output_3.out4_59",
+        "/model/layers.30/attn/o_proj/MatMulNBits/output_0.out5_4_91"
+      ],
+      "const_args": [
+        "model.layers.30.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.30/post_attention_layernorm/output_3.out4_60",
+        "/model/layers.30/post_attention_layernorm/output_0.out4_60"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_30",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.30/post_attention_layernorm/output_0.out4_60"
+      ],
+      "const_args": [
+        "model.layers.30.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.30.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.30.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.30.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.30.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.30/mlp/Mul/output_0.out3_30"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.30.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.30/mlp/Mul/output_0.out3_30"
+      ],
+      "const_args": [
+        "model.layers.30.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.30/mlp/down_proj/MatMulNBits/output_0.out5_4_92"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_61",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.30/post_attention_layernorm/output_3.out4_60",
+        "/model/layers.30/mlp/down_proj/MatMulNBits/output_0.out5_4_92"
+      ],
+      "const_args": [
+        "model.layers.31.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.31/input_layernorm/output_3.out4_61",
+        "/model/layers.31/input_layernorm/output_0.out4_61"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_31",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.31/input_layernorm/output_0.out4_61"
+      ],
+      "const_args": [
+        "model.layers.31.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.31/attn/qk_proj/MatMulNBits/output_0.out5_4_93"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "6144"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.31.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.31/input_layernorm/output_0.out4_61"
+      ],
+      "const_args": [
+        "model.layers.31.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.31.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "127",
+            "63"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.31/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.31/attn/qk_proj/MatMulNBits/output_0.out5_4_93",
+        "past_key_values.31.key",
+        "past_key_values.31.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.31/attn/GroupQueryAttention/output_0.out2_31",
+        "present.31.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.10206207633018494"
+          ]
+        },
+        "local_window_size": {
+          "type": "int",
+          "value": [
+            "262144"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "32",
+            "32",
+            "1",
+            "4096",
+            "96"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "124",
+            "62",
+            "2",
+            "0",
+            "125",
+            "63",
+            "6",
+            "0",
+            "126",
+            "62"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "192",
+            "6",
+            "0",
+            "0",
+            "192"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.31.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.31/attn/GroupQueryAttention/output_0.out2_31"
+      ],
+      "const_args": [
+        "model.layers.31.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.31/attn/o_proj/MatMulNBits/output_0.out5_4_94"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_62",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.31/input_layernorm/output_3.out4_61",
+        "/model/layers.31/attn/o_proj/MatMulNBits/output_0.out5_4_94"
+      ],
+      "const_args": [
+        "model.layers.31.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.31/post_attention_layernorm/output_3.out4_62",
+        "/model/layers.31/post_attention_layernorm/output_0.out4_62"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_31",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.31/post_attention_layernorm/output_0.out4_62"
+      ],
+      "const_args": [
+        "model.layers.31.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.31.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.31.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.31.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.31.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.31/mlp/Mul/output_0.out3_31"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "3072",
+            "8192"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.31.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.31/mlp/Mul/output_0.out3_31"
+      ],
+      "const_args": [
+        "model.layers.31.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.31/mlp/down_proj/MatMulNBits/output_0.out5_4_95"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "8192"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_63",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.31/post_attention_layernorm/output_3.out4_62",
+        "/model/layers.31/mlp/down_proj/MatMulNBits/output_0.out5_4_95"
+      ],
+      "const_args": [
+        "model.layers.32.final_norm_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.32/final_norm_layernorm/output_0.dummy",
+        "/model/layers.32/final_norm_layernorm/output_0.out4_63"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "3072"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/lm_head/MatMulNBits",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.32/final_norm_layernorm/output_0.out4_63"
+      ],
+      "const_args": [
+        "lm_head.MatMulNBits.qweight.preformat",
+        "lm_head.MatMulNBits.bias.preformat",
+        "lm_head.MatMulNBits.scales.preformat",
+        "lm_head.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "logits.out5_4_96"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "3072"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "32064"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    }
+  ],
+  "fused_tensors": {
+    "in": {
+      "buffer_size": 18560,
+      "xrt_arg_id": 0,
+      "packed_tensors": [
+        "/model/layers.0/input_layernorm/output_0.out5_4_0",
+        "attention_mask_const_uint",
+        "/model/embed_tokens/Gather/output_0.out4_0"
+      ]
+    },
+    "out": {
+      "buffer_size": 70272,
+      "xrt_arg_id": 1,
+      "packed_tensors": [
+        "/model/layers.32/final_norm_layernorm/output_0.dummy",
+        "logits.out5_4_96"
+      ]
+    },
+    "scratch": {
+      "buffer_size": 2287616,
+      "xrt_arg_id": 2,
+      "packed_tensors": [
+        "/model/layers.0/attn/qk_proj/MatMulNBits/output_0.out5_4_0",
+        "/model/layers.0/attn/GroupQueryAttention/output_0.out2_0",
+        "/model/layers.0/attn/o_proj/MatMulNBits/output_0.out5_4_1",
+        "/model/layers.0/post_attention_layernorm/output_3.out4_0",
+        "/model/layers.0/post_attention_layernorm/output_0.out4_0",
+        "/model/layers.0/mlp/Mul/output_0.out3_0",
+        "/model/layers.0/mlp/down_proj/MatMulNBits/output_0.out5_4_2",
+        "/model/layers.1/input_layernorm/output_3.out4_1",
+        "/model/layers.1/input_layernorm/output_0.out4_1",
+        "/model/layers.1/attn/qk_proj/MatMulNBits/output_0.out5_4_3",
+        "/model/layers.1/attn/GroupQueryAttention/output_0.out2_1",
+        "/model/layers.1/attn/o_proj/MatMulNBits/output_0.out5_4_4",
+        "/model/layers.1/post_attention_layernorm/output_3.out4_2",
+        "/model/layers.1/post_attention_layernorm/output_0.out4_2",
+        "/model/layers.1/mlp/Mul/output_0.out3_1",
+        "/model/layers.1/mlp/down_proj/MatMulNBits/output_0.out5_4_5",
+        "/model/layers.2/input_layernorm/output_3.out4_3",
+        "/model/layers.2/input_layernorm/output_0.out4_3",
+        "/model/layers.2/attn/qk_proj/MatMulNBits/output_0.out5_4_6",
+        "/model/layers.2/attn/GroupQueryAttention/output_0.out2_2",
+        "/model/layers.2/attn/o_proj/MatMulNBits/output_0.out5_4_7",
+        "/model/layers.2/post_attention_layernorm/output_3.out4_4",
+        "/model/layers.2/post_attention_layernorm/output_0.out4_4",
+        "/model/layers.2/mlp/Mul/output_0.out3_2",
+        "/model/layers.2/mlp/down_proj/MatMulNBits/output_0.out5_4_8",
+        "/model/layers.3/input_layernorm/output_3.out4_5",
+        "/model/layers.3/input_layernorm/output_0.out4_5",
+        "/model/layers.3/attn/qk_proj/MatMulNBits/output_0.out5_4_9",
+        "/model/layers.3/attn/GroupQueryAttention/output_0.out2_3",
+        "/model/layers.3/attn/o_proj/MatMulNBits/output_0.out5_4_10",
+        "/model/layers.3/post_attention_layernorm/output_3.out4_6",
+        "/model/layers.3/post_attention_layernorm/output_0.out4_6",
+        "/model/layers.3/mlp/Mul/output_0.out3_3",
+        "/model/layers.3/mlp/down_proj/MatMulNBits/output_0.out5_4_11",
+        "/model/layers.4/input_layernorm/output_3.out4_7",
+        "/model/layers.4/input_layernorm/output_0.out4_7",
+        "/model/layers.4/attn/qk_proj/MatMulNBits/output_0.out5_4_12",
+        "/model/layers.4/attn/GroupQueryAttention/output_0.out2_4",
+        "/model/layers.4/attn/o_proj/MatMulNBits/output_0.out5_4_13",
+        "/model/layers.4/post_attention_layernorm/output_3.out4_8",
+        "/model/layers.4/post_attention_layernorm/output_0.out4_8",
+        "/model/layers.4/mlp/Mul/output_0.out3_4",
+        "/model/layers.4/mlp/down_proj/MatMulNBits/output_0.out5_4_14",
+        "/model/layers.5/input_layernorm/output_3.out4_9",
+        "/model/layers.5/input_layernorm/output_0.out4_9",
+        "/model/layers.5/attn/qk_proj/MatMulNBits/output_0.out5_4_15",
+        "/model/layers.5/attn/GroupQueryAttention/output_0.out2_5",
+        "/model/layers.5/attn/o_proj/MatMulNBits/output_0.out5_4_16",
+        "/model/layers.5/post_attention_layernorm/output_3.out4_10",
+        "/model/layers.5/post_attention_layernorm/output_0.out4_10",
+        "/model/layers.5/mlp/Mul/output_0.out3_5",
+        "/model/layers.5/mlp/down_proj/MatMulNBits/output_0.out5_4_17",
+        "/model/layers.6/input_layernorm/output_3.out4_11",
+        "/model/layers.6/input_layernorm/output_0.out4_11",
+        "/model/layers.6/attn/qk_proj/MatMulNBits/output_0.out5_4_18",
+        "/model/layers.6/attn/GroupQueryAttention/output_0.out2_6",
+        "/model/layers.6/attn/o_proj/MatMulNBits/output_0.out5_4_19",
+        "/model/layers.6/post_attention_layernorm/output_3.out4_12",
+        "/model/layers.6/post_attention_layernorm/output_0.out4_12",
+        "/model/layers.6/mlp/Mul/output_0.out3_6",
+        "/model/layers.6/mlp/down_proj/MatMulNBits/output_0.out5_4_20",
+        "/model/layers.7/input_layernorm/output_3.out4_13",
+        "/model/layers.7/input_layernorm/output_0.out4_13",
+        "/model/layers.7/attn/qk_proj/MatMulNBits/output_0.out5_4_21",
+        "/model/layers.7/attn/GroupQueryAttention/output_0.out2_7",
+        "/model/layers.7/attn/o_proj/MatMulNBits/output_0.out5_4_22",
+        "/model/layers.7/post_attention_layernorm/output_3.out4_14",
+        "/model/layers.7/post_attention_layernorm/output_0.out4_14",
+        "/model/layers.7/mlp/Mul/output_0.out3_7",
+        "/model/layers.7/mlp/down_proj/MatMulNBits/output_0.out5_4_23",
+        "/model/layers.8/input_layernorm/output_3.out4_15",
+        "/model/layers.8/input_layernorm/output_0.out4_15",
+        "/model/layers.8/attn/qk_proj/MatMulNBits/output_0.out5_4_24",
+        "/model/layers.8/attn/GroupQueryAttention/output_0.out2_8",
+        "/model/layers.8/attn/o_proj/MatMulNBits/output_0.out5_4_25",
+        "/model/layers.8/post_attention_layernorm/output_3.out4_16",
+        "/model/layers.8/post_attention_layernorm/output_0.out4_16",
+        "/model/layers.8/mlp/Mul/output_0.out3_8",
+        "/model/layers.8/mlp/down_proj/MatMulNBits/output_0.out5_4_26",
+        "/model/layers.9/input_layernorm/output_3.out4_17",
+        "/model/layers.9/input_layernorm/output_0.out4_17",
+        "/model/layers.9/attn/qk_proj/MatMulNBits/output_0.out5_4_27",
+        "/model/layers.9/attn/GroupQueryAttention/output_0.out2_9",
+        "/model/layers.9/attn/o_proj/MatMulNBits/output_0.out5_4_28",
+        "/model/layers.9/post_attention_layernorm/output_3.out4_18",
+        "/model/layers.9/post_attention_layernorm/output_0.out4_18",
+        "/model/layers.9/mlp/Mul/output_0.out3_9",
+        "/model/layers.9/mlp/down_proj/MatMulNBits/output_0.out5_4_29",
+        "/model/layers.10/input_layernorm/output_3.out4_19",
+        "/model/layers.10/input_layernorm/output_0.out4_19",
+        "/model/layers.10/attn/qk_proj/MatMulNBits/output_0.out5_4_30",
+        "/model/layers.10/attn/GroupQueryAttention/output_0.out2_10",
+        "/model/layers.10/attn/o_proj/MatMulNBits/output_0.out5_4_31",
+        "/model/layers.10/post_attention_layernorm/output_3.out4_20",
+        "/model/layers.10/post_attention_layernorm/output_0.out4_20",
+        "/model/layers.10/mlp/Mul/output_0.out3_10",
+        "/model/layers.10/mlp/down_proj/MatMulNBits/output_0.out5_4_32",
+        "/model/layers.11/input_layernorm/output_3.out4_21",
+        "/model/layers.11/input_layernorm/output_0.out4_21",
+        "/model/layers.11/attn/qk_proj/MatMulNBits/output_0.out5_4_33",
+        "/model/layers.11/attn/GroupQueryAttention/output_0.out2_11",
+        "/model/layers.11/attn/o_proj/MatMulNBits/output_0.out5_4_34",
+        "/model/layers.11/post_attention_layernorm/output_3.out4_22",
+        "/model/layers.11/post_attention_layernorm/output_0.out4_22",
+        "/model/layers.11/mlp/Mul/output_0.out3_11",
+        "/model/layers.11/mlp/down_proj/MatMulNBits/output_0.out5_4_35",
+        "/model/layers.12/input_layernorm/output_3.out4_23",
+        "/model/layers.12/input_layernorm/output_0.out4_23",
+        "/model/layers.12/attn/qk_proj/MatMulNBits/output_0.out5_4_36",
+        "/model/layers.12/attn/GroupQueryAttention/output_0.out2_12",
+        "/model/layers.12/attn/o_proj/MatMulNBits/output_0.out5_4_37",
+        "/model/layers.12/post_attention_layernorm/output_3.out4_24",
+        "/model/layers.12/post_attention_layernorm/output_0.out4_24",
+        "/model/layers.12/mlp/Mul/output_0.out3_12",
+        "/model/layers.12/mlp/down_proj/MatMulNBits/output_0.out5_4_38",
+        "/model/layers.13/input_layernorm/output_3.out4_25",
+        "/model/layers.13/input_layernorm/output_0.out4_25",
+        "/model/layers.13/attn/qk_proj/MatMulNBits/output_0.out5_4_39",
+        "/model/layers.13/attn/GroupQueryAttention/output_0.out2_13",
+        "/model/layers.13/attn/o_proj/MatMulNBits/output_0.out5_4_40",
+        "/model/layers.13/post_attention_layernorm/output_3.out4_26",
+        "/model/layers.13/post_attention_layernorm/output_0.out4_26",
+        "/model/layers.13/mlp/Mul/output_0.out3_13",
+        "/model/layers.13/mlp/down_proj/MatMulNBits/output_0.out5_4_41",
+        "/model/layers.14/input_layernorm/output_3.out4_27",
+        "/model/layers.14/input_layernorm/output_0.out4_27",
+        "/model/layers.14/attn/qk_proj/MatMulNBits/output_0.out5_4_42",
+        "/model/layers.14/attn/GroupQueryAttention/output_0.out2_14",
+        "/model/layers.14/attn/o_proj/MatMulNBits/output_0.out5_4_43",
+        "/model/layers.14/post_attention_layernorm/output_3.out4_28",
+        "/model/layers.14/post_attention_layernorm/output_0.out4_28",
+        "/model/layers.14/mlp/Mul/output_0.out3_14",
+        "/model/layers.14/mlp/down_proj/MatMulNBits/output_0.out5_4_44",
+        "/model/layers.15/input_layernorm/output_3.out4_29",
+        "/model/layers.15/input_layernorm/output_0.out4_29",
+        "/model/layers.15/attn/qk_proj/MatMulNBits/output_0.out5_4_45",
+        "/model/layers.15/attn/GroupQueryAttention/output_0.out2_15",
+        "/model/layers.15/attn/o_proj/MatMulNBits/output_0.out5_4_46",
+        "/model/layers.15/post_attention_layernorm/output_3.out4_30",
+        "/model/layers.15/post_attention_layernorm/output_0.out4_30",
+        "/model/layers.15/mlp/Mul/output_0.out3_15",
+        "/model/layers.15/mlp/down_proj/MatMulNBits/output_0.out5_4_47",
+        "/model/layers.16/input_layernorm/output_3.out4_31",
+        "/model/layers.16/input_layernorm/output_0.out4_31",
+        "/model/layers.16/attn/qk_proj/MatMulNBits/output_0.out5_4_48",
+        "/model/layers.16/attn/GroupQueryAttention/output_0.out2_16",
+        "/model/layers.16/attn/o_proj/MatMulNBits/output_0.out5_4_49",
+        "/model/layers.16/post_attention_layernorm/output_3.out4_32",
+        "/model/layers.16/post_attention_layernorm/output_0.out4_32",
+        "/model/layers.16/mlp/Mul/output_0.out3_16",
+        "/model/layers.16/mlp/down_proj/MatMulNBits/output_0.out5_4_50",
+        "/model/layers.17/input_layernorm/output_3.out4_33",
+        "/model/layers.17/input_layernorm/output_0.out4_33",
+        "/model/layers.17/attn/qk_proj/MatMulNBits/output_0.out5_4_51",
+        "/model/layers.17/attn/GroupQueryAttention/output_0.out2_17",
+        "/model/layers.17/attn/o_proj/MatMulNBits/output_0.out5_4_52",
+        "/model/layers.17/post_attention_layernorm/output_3.out4_34",
+        "/model/layers.17/post_attention_layernorm/output_0.out4_34",
+        "/model/layers.17/mlp/Mul/output_0.out3_17",
+        "/model/layers.17/mlp/down_proj/MatMulNBits/output_0.out5_4_53",
+        "/model/layers.18/input_layernorm/output_3.out4_35",
+        "/model/layers.18/input_layernorm/output_0.out4_35",
+        "/model/layers.18/attn/qk_proj/MatMulNBits/output_0.out5_4_54",
+        "/model/layers.18/attn/GroupQueryAttention/output_0.out2_18",
+        "/model/layers.18/attn/o_proj/MatMulNBits/output_0.out5_4_55",
+        "/model/layers.18/post_attention_layernorm/output_3.out4_36",
+        "/model/layers.18/post_attention_layernorm/output_0.out4_36",
+        "/model/layers.18/mlp/Mul/output_0.out3_18",
+        "/model/layers.18/mlp/down_proj/MatMulNBits/output_0.out5_4_56",
+        "/model/layers.19/input_layernorm/output_3.out4_37",
+        "/model/layers.19/input_layernorm/output_0.out4_37",
+        "/model/layers.19/attn/qk_proj/MatMulNBits/output_0.out5_4_57",
+        "/model/layers.19/attn/GroupQueryAttention/output_0.out2_19",
+        "/model/layers.19/attn/o_proj/MatMulNBits/output_0.out5_4_58",
+        "/model/layers.19/post_attention_layernorm/output_3.out4_38",
+        "/model/layers.19/post_attention_layernorm/output_0.out4_38",
+        "/model/layers.19/mlp/Mul/output_0.out3_19",
+        "/model/layers.19/mlp/down_proj/MatMulNBits/output_0.out5_4_59",
+        "/model/layers.20/input_layernorm/output_3.out4_39",
+        "/model/layers.20/input_layernorm/output_0.out4_39",
+        "/model/layers.20/attn/qk_proj/MatMulNBits/output_0.out5_4_60",
+        "/model/layers.20/attn/GroupQueryAttention/output_0.out2_20",
+        "/model/layers.20/attn/o_proj/MatMulNBits/output_0.out5_4_61",
+        "/model/layers.20/post_attention_layernorm/output_3.out4_40",
+        "/model/layers.20/post_attention_layernorm/output_0.out4_40",
+        "/model/layers.20/mlp/Mul/output_0.out3_20",
+        "/model/layers.20/mlp/down_proj/MatMulNBits/output_0.out5_4_62",
+        "/model/layers.21/input_layernorm/output_3.out4_41",
+        "/model/layers.21/input_layernorm/output_0.out4_41",
+        "/model/layers.21/attn/qk_proj/MatMulNBits/output_0.out5_4_63",
+        "/model/layers.21/attn/GroupQueryAttention/output_0.out2_21",
+        "/model/layers.21/attn/o_proj/MatMulNBits/output_0.out5_4_64",
+        "/model/layers.21/post_attention_layernorm/output_3.out4_42",
+        "/model/layers.21/post_attention_layernorm/output_0.out4_42",
+        "/model/layers.21/mlp/Mul/output_0.out3_21",
+        "/model/layers.21/mlp/down_proj/MatMulNBits/output_0.out5_4_65",
+        "/model/layers.22/input_layernorm/output_3.out4_43",
+        "/model/layers.22/input_layernorm/output_0.out4_43",
+        "/model/layers.22/attn/qk_proj/MatMulNBits/output_0.out5_4_66",
+        "/model/layers.22/attn/GroupQueryAttention/output_0.out2_22",
+        "/model/layers.22/attn/o_proj/MatMulNBits/output_0.out5_4_67",
+        "/model/layers.22/post_attention_layernorm/output_3.out4_44",
+        "/model/layers.22/post_attention_layernorm/output_0.out4_44",
+        "/model/layers.22/mlp/Mul/output_0.out3_22",
+        "/model/layers.22/mlp/down_proj/MatMulNBits/output_0.out5_4_68",
+        "/model/layers.23/input_layernorm/output_3.out4_45",
+        "/model/layers.23/input_layernorm/output_0.out4_45",
+        "/model/layers.23/attn/qk_proj/MatMulNBits/output_0.out5_4_69",
+        "/model/layers.23/attn/GroupQueryAttention/output_0.out2_23",
+        "/model/layers.23/attn/o_proj/MatMulNBits/output_0.out5_4_70",
+        "/model/layers.23/post_attention_layernorm/output_3.out4_46",
+        "/model/layers.23/post_attention_layernorm/output_0.out4_46",
+        "/model/layers.23/mlp/Mul/output_0.out3_23",
+        "/model/layers.23/mlp/down_proj/MatMulNBits/output_0.out5_4_71",
+        "/model/layers.24/input_layernorm/output_3.out4_47",
+        "/model/layers.24/input_layernorm/output_0.out4_47",
+        "/model/layers.24/attn/qk_proj/MatMulNBits/output_0.out5_4_72",
+        "/model/layers.24/attn/GroupQueryAttention/output_0.out2_24",
+        "/model/layers.24/attn/o_proj/MatMulNBits/output_0.out5_4_73",
+        "/model/layers.24/post_attention_layernorm/output_3.out4_48",
+        "/model/layers.24/post_attention_layernorm/output_0.out4_48",
+        "/model/layers.24/mlp/Mul/output_0.out3_24",
+        "/model/layers.24/mlp/down_proj/MatMulNBits/output_0.out5_4_74",
+        "/model/layers.25/input_layernorm/output_3.out4_49",
+        "/model/layers.25/input_layernorm/output_0.out4_49",
+        "/model/layers.25/attn/qk_proj/MatMulNBits/output_0.out5_4_75",
+        "/model/layers.25/attn/GroupQueryAttention/output_0.out2_25",
+        "/model/layers.25/attn/o_proj/MatMulNBits/output_0.out5_4_76",
+        "/model/layers.25/post_attention_layernorm/output_3.out4_50",
+        "/model/layers.25/post_attention_layernorm/output_0.out4_50",
+        "/model/layers.25/mlp/Mul/output_0.out3_25",
+        "/model/layers.25/mlp/down_proj/MatMulNBits/output_0.out5_4_77",
+        "/model/layers.26/input_layernorm/output_3.out4_51",
+        "/model/layers.26/input_layernorm/output_0.out4_51",
+        "/model/layers.26/attn/qk_proj/MatMulNBits/output_0.out5_4_78",
+        "/model/layers.26/attn/GroupQueryAttention/output_0.out2_26",
+        "/model/layers.26/attn/o_proj/MatMulNBits/output_0.out5_4_79",
+        "/model/layers.26/post_attention_layernorm/output_3.out4_52",
+        "/model/layers.26/post_attention_layernorm/output_0.out4_52",
+        "/model/layers.26/mlp/Mul/output_0.out3_26",
+        "/model/layers.26/mlp/down_proj/MatMulNBits/output_0.out5_4_80",
+        "/model/layers.27/input_layernorm/output_3.out4_53",
+        "/model/layers.27/input_layernorm/output_0.out4_53",
+        "/model/layers.27/attn/qk_proj/MatMulNBits/output_0.out5_4_81",
+        "/model/layers.27/attn/GroupQueryAttention/output_0.out2_27",
+        "/model/layers.27/attn/o_proj/MatMulNBits/output_0.out5_4_82",
+        "/model/layers.27/post_attention_layernorm/output_3.out4_54",
+        "/model/layers.27/post_attention_layernorm/output_0.out4_54",
+        "/model/layers.27/mlp/Mul/output_0.out3_27",
+        "/model/layers.27/mlp/down_proj/MatMulNBits/output_0.out5_4_83",
+        "/model/layers.28/input_layernorm/output_3.out4_55",
+        "/model/layers.28/input_layernorm/output_0.out4_55",
+        "/model/layers.28/attn/qk_proj/MatMulNBits/output_0.out5_4_84",
+        "/model/layers.28/attn/GroupQueryAttention/output_0.out2_28",
+        "/model/layers.28/attn/o_proj/MatMulNBits/output_0.out5_4_85",
+        "/model/layers.28/post_attention_layernorm/output_3.out4_56",
+        "/model/layers.28/post_attention_layernorm/output_0.out4_56",
+        "/model/layers.28/mlp/Mul/output_0.out3_28",
+        "/model/layers.28/mlp/down_proj/MatMulNBits/output_0.out5_4_86",
+        "/model/layers.29/input_layernorm/output_3.out4_57",
+        "/model/layers.29/input_layernorm/output_0.out4_57",
+        "/model/layers.29/attn/qk_proj/MatMulNBits/output_0.out5_4_87",
+        "/model/layers.29/attn/GroupQueryAttention/output_0.out2_29",
+        "/model/layers.29/attn/o_proj/MatMulNBits/output_0.out5_4_88",
+        "/model/layers.29/post_attention_layernorm/output_3.out4_58",
+        "/model/layers.29/post_attention_layernorm/output_0.out4_58",
+        "/model/layers.29/mlp/Mul/output_0.out3_29",
+        "/model/layers.29/mlp/down_proj/MatMulNBits/output_0.out5_4_89",
+        "/model/layers.30/input_layernorm/output_3.out4_59",
+        "/model/layers.30/input_layernorm/output_0.out4_59",
+        "/model/layers.30/attn/qk_proj/MatMulNBits/output_0.out5_4_90",
+        "/model/layers.30/attn/GroupQueryAttention/output_0.out2_30",
+        "/model/layers.30/attn/o_proj/MatMulNBits/output_0.out5_4_91",
+        "/model/layers.30/post_attention_layernorm/output_3.out4_60",
+        "/model/layers.30/post_attention_layernorm/output_0.out4_60",
+        "/model/layers.30/mlp/Mul/output_0.out3_30",
+        "/model/layers.30/mlp/down_proj/MatMulNBits/output_0.out5_4_92",
+        "/model/layers.31/input_layernorm/output_3.out4_61",
+        "/model/layers.31/input_layernorm/output_0.out4_61",
+        "/model/layers.31/attn/qk_proj/MatMulNBits/output_0.out5_4_93",
+        "/model/layers.31/attn/GroupQueryAttention/output_0.out2_31",
+        "/model/layers.31/attn/o_proj/MatMulNBits/output_0.out5_4_94",
+        "/model/layers.31/post_attention_layernorm/output_3.out4_62",
+        "/model/layers.31/post_attention_layernorm/output_0.out4_62",
+        "/model/layers.31/mlp/Mul/output_0.out3_31",
+        "/model/layers.31/mlp/down_proj/MatMulNBits/output_0.out5_4_95",
+        "/model/layers.32/final_norm_layernorm/output_0.out4_63"
+      ]
+    },
+    "const": {
+      "buffer_size": 3060771584,
+      "xrt_arg_id": 3,
+      "packed_tensors": [
+        "model.layers.0.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.0.post_attention_layernorm.weight.bf",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.0.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.0.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.0.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.0.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.0.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.1.input_layernorm.weight.bf",
+        "model.layers.1.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.1.post_attention_layernorm.weight.bf",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.1.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.1.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.1.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.1.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.1.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.2.input_layernorm.weight.bf",
+        "model.layers.2.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.2.post_attention_layernorm.weight.bf",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.2.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.2.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.2.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.2.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.2.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.3.input_layernorm.weight.bf",
+        "model.layers.3.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.3.post_attention_layernorm.weight.bf",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.3.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.3.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.3.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.3.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.3.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.4.input_layernorm.weight.bf",
+        "model.layers.4.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.4.post_attention_layernorm.weight.bf",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.4.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.4.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.4.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.4.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.4.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.5.input_layernorm.weight.bf",
+        "model.layers.5.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.5.post_attention_layernorm.weight.bf",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.5.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.5.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.5.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.5.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.5.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.6.input_layernorm.weight.bf",
+        "model.layers.6.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.6.post_attention_layernorm.weight.bf",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.6.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.6.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.6.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.6.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.6.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.7.input_layernorm.weight.bf",
+        "model.layers.7.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.7.post_attention_layernorm.weight.bf",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.7.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.7.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.7.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.7.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.7.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.8.input_layernorm.weight.bf",
+        "model.layers.8.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.8.post_attention_layernorm.weight.bf",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.8.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.8.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.8.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.8.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.8.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.9.input_layernorm.weight.bf",
+        "model.layers.9.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.9.post_attention_layernorm.weight.bf",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.9.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.9.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.9.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.9.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.9.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.10.input_layernorm.weight.bf",
+        "model.layers.10.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.10.post_attention_layernorm.weight.bf",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.10.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.10.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.10.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.10.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.10.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.11.input_layernorm.weight.bf",
+        "model.layers.11.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.11.post_attention_layernorm.weight.bf",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.11.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.11.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.11.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.11.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.11.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.12.input_layernorm.weight.bf",
+        "model.layers.12.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.12.post_attention_layernorm.weight.bf",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.12.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.12.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.12.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.12.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.12.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.13.input_layernorm.weight.bf",
+        "model.layers.13.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.13.post_attention_layernorm.weight.bf",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.13.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.13.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.13.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.13.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.13.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.14.input_layernorm.weight.bf",
+        "model.layers.14.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.14.post_attention_layernorm.weight.bf",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.14.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.14.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.14.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.14.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.14.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.15.input_layernorm.weight.bf",
+        "model.layers.15.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.15.post_attention_layernorm.weight.bf",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.15.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.15.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.15.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.15.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.15.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.16.input_layernorm.weight.bf",
+        "model.layers.16.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.16.post_attention_layernorm.weight.bf",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.16.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.16.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.16.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.16.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.16.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.17.input_layernorm.weight.bf",
+        "model.layers.17.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.17.post_attention_layernorm.weight.bf",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.17.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.17.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.17.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.17.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.17.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.18.input_layernorm.weight.bf",
+        "model.layers.18.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.18.post_attention_layernorm.weight.bf",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.18.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.18.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.18.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.18.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.18.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.19.input_layernorm.weight.bf",
+        "model.layers.19.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.19.post_attention_layernorm.weight.bf",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.19.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.19.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.19.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.19.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.19.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.20.input_layernorm.weight.bf",
+        "model.layers.20.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.20.post_attention_layernorm.weight.bf",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.20.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.20.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.20.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.20.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.20.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.21.input_layernorm.weight.bf",
+        "model.layers.21.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.21.post_attention_layernorm.weight.bf",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.21.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.21.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.21.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.21.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.21.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.22.input_layernorm.weight.bf",
+        "model.layers.22.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.22.post_attention_layernorm.weight.bf",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.22.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.22.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.22.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.22.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.22.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.23.input_layernorm.weight.bf",
+        "model.layers.23.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.23.post_attention_layernorm.weight.bf",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.23.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.23.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.23.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.23.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.23.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.24.input_layernorm.weight.bf",
+        "model.layers.24.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.24.post_attention_layernorm.weight.bf",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.24.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.24.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.24.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.24.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.24.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.25.input_layernorm.weight.bf",
+        "model.layers.25.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.25.post_attention_layernorm.weight.bf",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.25.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.25.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.25.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.25.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.25.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.26.input_layernorm.weight.bf",
+        "model.layers.26.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.26.post_attention_layernorm.weight.bf",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.26.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.26.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.26.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.26.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.26.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.27.input_layernorm.weight.bf",
+        "model.layers.27.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.27.post_attention_layernorm.weight.bf",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.27.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.27.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.27.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.27.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.27.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.28.input_layernorm.weight.bf",
+        "model.layers.28.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.28.post_attention_layernorm.weight.bf",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.28.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.28.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.28.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.28.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.28.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.29.input_layernorm.weight.bf",
+        "model.layers.29.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.29.post_attention_layernorm.weight.bf",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.29.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.29.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.29.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.29.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.29.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.30.input_layernorm.weight.bf",
+        "model.layers.30.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.30.post_attention_layernorm.weight.bf",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.30.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.30.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.30.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.30.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.30.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.31.input_layernorm.weight.bf",
+        "model.layers.31.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.31.post_attention_layernorm.weight.bf",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.31.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.31.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.31.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.31.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.31.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.32.final_norm_layernorm.weight.bf",
+        "lm_head.MatMulNBits.qweight.preformat",
+        "lm_head.MatMulNBits.bias.preformat",
+        "lm_head.MatMulNBits.scales.preformat",
+        "lm_head.MatMulNBits.qzeros.preformat"
+      ]
+    },
+    "super_instr": {
+      "buffer_size": 0,
+      "xrt_arg_id": 4,
+      "packed_tensors": []
+    },
+    "ext_buf_0": {
+      "buffer_size": 1610612736,
+      "xrt_arg_id": 5,
+      "packed_tensors": [
+        "past_key_values.0.key",
+        "past_key_values.0.value",
+        "present.0.key",
+        "present.0.value",
+        "past_key_values.1.key",
+        "past_key_values.1.value",
+        "present.1.key",
+        "present.1.value",
+        "past_key_values.2.key",
+        "past_key_values.2.value",
+        "present.2.key",
+        "present.2.value",
+        "past_key_values.3.key",
+        "past_key_values.3.value",
+        "present.3.key",
+        "present.3.value",
+        "past_key_values.4.key",
+        "past_key_values.4.value",
+        "present.4.key",
+        "present.4.value",
+        "past_key_values.5.key",
+        "past_key_values.5.value",
+        "present.5.key",
+        "present.5.value",
+        "past_key_values.6.key",
+        "past_key_values.6.value",
+        "present.6.key",
+        "present.6.value",
+        "past_key_values.7.key",
+        "past_key_values.7.value",
+        "present.7.key",
+        "present.7.value",
+        "past_key_values.8.key",
+        "past_key_values.8.value",
+        "present.8.key",
+        "present.8.value",
+        "past_key_values.9.key",
+        "past_key_values.9.value",
+        "present.9.key",
+        "present.9.value",
+        "past_key_values.10.key",
+        "past_key_values.10.value",
+        "present.10.key",
+        "present.10.value",
+        "past_key_values.11.key",
+        "past_key_values.11.value",
+        "present.11.key",
+        "present.11.value",
+        "past_key_values.12.key",
+        "past_key_values.12.value",
+        "present.12.key",
+        "present.12.value",
+        "past_key_values.13.key",
+        "past_key_values.13.value",
+        "present.13.key",
+        "present.13.value",
+        "past_key_values.14.key",
+        "past_key_values.14.value",
+        "present.14.key",
+        "present.14.value",
+        "past_key_values.15.key",
+        "past_key_values.15.value",
+        "present.15.key",
+        "present.15.value",
+        "past_key_values.16.key",
+        "past_key_values.16.value",
+        "present.16.key",
+        "present.16.value",
+        "past_key_values.17.key",
+        "past_key_values.17.value",
+        "present.17.key",
+        "present.17.value",
+        "past_key_values.18.key",
+        "past_key_values.18.value",
+        "present.18.key",
+        "present.18.value",
+        "past_key_values.19.key",
+        "past_key_values.19.value",
+        "present.19.key",
+        "present.19.value",
+        "past_key_values.20.key",
+        "past_key_values.20.value",
+        "present.20.key",
+        "present.20.value",
+        "past_key_values.21.key",
+        "past_key_values.21.value",
+        "present.21.key",
+        "present.21.value",
+        "past_key_values.22.key",
+        "past_key_values.22.value",
+        "present.22.key",
+        "present.22.value",
+        "past_key_values.23.key",
+        "past_key_values.23.value",
+        "present.23.key",
+        "present.23.value",
+        "past_key_values.24.key",
+        "past_key_values.24.value",
+        "present.24.key",
+        "present.24.value",
+        "past_key_values.25.key",
+        "past_key_values.25.value",
+        "present.25.key",
+        "present.25.value",
+        "past_key_values.26.key",
+        "past_key_values.26.value",
+        "present.26.key",
+        "present.26.value",
+        "past_key_values.27.key",
+        "past_key_values.27.value",
+        "present.27.key",
+        "present.27.value",
+        "past_key_values.28.key",
+        "past_key_values.28.value",
+        "present.28.key",
+        "present.28.value",
+        "past_key_values.29.key",
+        "past_key_values.29.value",
+        "present.29.key",
+        "present.29.value",
+        "past_key_values.30.key",
+        "past_key_values.30.value",
+        "present.30.key",
+        "present.30.value",
+        "past_key_values.31.key",
+        "past_key_values.31.value",
+        "present.31.key",
+        "present.31.value"
+      ]
+    },
+    "ext_buf_1": {
+      "buffer_size": 25952256,
+      "xrt_arg_id": 6,
+      "packed_tensors": [
+        "sin_cos_cache_token"
+      ]
+    }
+  },
+  "tensor_map": {
+    "/model/layers.0/input_layernorm/output_0.out5_4_0": {
+      "packed_buffer_label": "in",
+      "xrt_arg_id": 0,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 6144
+    },
+    "attention_mask_const_uint": {
+      "packed_buffer_label": "in",
+      "xrt_arg_id": 0,
+      "dtype": "uint32",
+      "shape": [
+        1
+      ],
+      "size_in_bytes": 4,
+      "op_tensor_size": 4,
+      "offset": 18556
+    },
+    "/model/embed_tokens/Gather/output_0.out4_0": {
+      "packed_buffer_label": "in",
+      "xrt_arg_id": 0,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 12292
+    },
+    "/model/layers.32/final_norm_layernorm/output_0.dummy": {
+      "packed_buffer_label": "out",
+      "xrt_arg_id": 1,
+      "dtype": "float16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 0
+    },
+    "logits.out5_4_96": {
+      "packed_buffer_label": "out",
+      "xrt_arg_id": 1,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        32064
+      ],
+      "size_in_bytes": 64128,
+      "op_tensor_size": 64128,
+      "offset": 6144
+    },
+    "/model/layers.0/attn/qk_proj/MatMulNBits/output_0.out5_4_0": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 0
+    },
+    "/model/layers.0/attn/GroupQueryAttention/output_0.out2_0": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 12288
+    },
+    "/model/layers.0/attn/o_proj/MatMulNBits/output_0.out5_4_1": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 18432
+    },
+    "/model/layers.0/post_attention_layernorm/output_3.out4_0": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 24576
+    },
+    "/model/layers.0/post_attention_layernorm/output_0.out4_0": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 30720
+    },
+    "/model/layers.0/mlp/Mul/output_0.out3_0": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 36864
+    },
+    "/model/layers.0/mlp/down_proj/MatMulNBits/output_0.out5_4_2": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 53248
+    },
+    "/model/layers.1/input_layernorm/output_3.out4_1": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 59392
+    },
+    "/model/layers.1/input_layernorm/output_0.out4_1": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 65536
+    },
+    "/model/layers.1/attn/qk_proj/MatMulNBits/output_0.out5_4_3": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 71680
+    },
+    "/model/layers.1/attn/GroupQueryAttention/output_0.out2_1": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 83968
+    },
+    "/model/layers.1/attn/o_proj/MatMulNBits/output_0.out5_4_4": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 90112
+    },
+    "/model/layers.1/post_attention_layernorm/output_3.out4_2": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 96256
+    },
+    "/model/layers.1/post_attention_layernorm/output_0.out4_2": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 102400
+    },
+    "/model/layers.1/mlp/Mul/output_0.out3_1": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 108544
+    },
+    "/model/layers.1/mlp/down_proj/MatMulNBits/output_0.out5_4_5": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 124928
+    },
+    "/model/layers.2/input_layernorm/output_3.out4_3": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 131072
+    },
+    "/model/layers.2/input_layernorm/output_0.out4_3": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 137216
+    },
+    "/model/layers.2/attn/qk_proj/MatMulNBits/output_0.out5_4_6": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 143360
+    },
+    "/model/layers.2/attn/GroupQueryAttention/output_0.out2_2": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 155648
+    },
+    "/model/layers.2/attn/o_proj/MatMulNBits/output_0.out5_4_7": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 161792
+    },
+    "/model/layers.2/post_attention_layernorm/output_3.out4_4": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 167936
+    },
+    "/model/layers.2/post_attention_layernorm/output_0.out4_4": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 174080
+    },
+    "/model/layers.2/mlp/Mul/output_0.out3_2": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 180224
+    },
+    "/model/layers.2/mlp/down_proj/MatMulNBits/output_0.out5_4_8": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 196608
+    },
+    "/model/layers.3/input_layernorm/output_3.out4_5": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 202752
+    },
+    "/model/layers.3/input_layernorm/output_0.out4_5": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 208896
+    },
+    "/model/layers.3/attn/qk_proj/MatMulNBits/output_0.out5_4_9": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 215040
+    },
+    "/model/layers.3/attn/GroupQueryAttention/output_0.out2_3": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 227328
+    },
+    "/model/layers.3/attn/o_proj/MatMulNBits/output_0.out5_4_10": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 233472
+    },
+    "/model/layers.3/post_attention_layernorm/output_3.out4_6": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 239616
+    },
+    "/model/layers.3/post_attention_layernorm/output_0.out4_6": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 245760
+    },
+    "/model/layers.3/mlp/Mul/output_0.out3_3": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 251904
+    },
+    "/model/layers.3/mlp/down_proj/MatMulNBits/output_0.out5_4_11": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 268288
+    },
+    "/model/layers.4/input_layernorm/output_3.out4_7": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 274432
+    },
+    "/model/layers.4/input_layernorm/output_0.out4_7": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 280576
+    },
+    "/model/layers.4/attn/qk_proj/MatMulNBits/output_0.out5_4_12": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 286720
+    },
+    "/model/layers.4/attn/GroupQueryAttention/output_0.out2_4": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 299008
+    },
+    "/model/layers.4/attn/o_proj/MatMulNBits/output_0.out5_4_13": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 305152
+    },
+    "/model/layers.4/post_attention_layernorm/output_3.out4_8": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 311296
+    },
+    "/model/layers.4/post_attention_layernorm/output_0.out4_8": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 317440
+    },
+    "/model/layers.4/mlp/Mul/output_0.out3_4": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 323584
+    },
+    "/model/layers.4/mlp/down_proj/MatMulNBits/output_0.out5_4_14": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 339968
+    },
+    "/model/layers.5/input_layernorm/output_3.out4_9": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 346112
+    },
+    "/model/layers.5/input_layernorm/output_0.out4_9": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 352256
+    },
+    "/model/layers.5/attn/qk_proj/MatMulNBits/output_0.out5_4_15": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 358400
+    },
+    "/model/layers.5/attn/GroupQueryAttention/output_0.out2_5": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 370688
+    },
+    "/model/layers.5/attn/o_proj/MatMulNBits/output_0.out5_4_16": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 376832
+    },
+    "/model/layers.5/post_attention_layernorm/output_3.out4_10": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 382976
+    },
+    "/model/layers.5/post_attention_layernorm/output_0.out4_10": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 389120
+    },
+    "/model/layers.5/mlp/Mul/output_0.out3_5": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 395264
+    },
+    "/model/layers.5/mlp/down_proj/MatMulNBits/output_0.out5_4_17": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 411648
+    },
+    "/model/layers.6/input_layernorm/output_3.out4_11": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 417792
+    },
+    "/model/layers.6/input_layernorm/output_0.out4_11": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 423936
+    },
+    "/model/layers.6/attn/qk_proj/MatMulNBits/output_0.out5_4_18": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 430080
+    },
+    "/model/layers.6/attn/GroupQueryAttention/output_0.out2_6": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 442368
+    },
+    "/model/layers.6/attn/o_proj/MatMulNBits/output_0.out5_4_19": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 448512
+    },
+    "/model/layers.6/post_attention_layernorm/output_3.out4_12": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 454656
+    },
+    "/model/layers.6/post_attention_layernorm/output_0.out4_12": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 460800
+    },
+    "/model/layers.6/mlp/Mul/output_0.out3_6": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 466944
+    },
+    "/model/layers.6/mlp/down_proj/MatMulNBits/output_0.out5_4_20": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 483328
+    },
+    "/model/layers.7/input_layernorm/output_3.out4_13": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 489472
+    },
+    "/model/layers.7/input_layernorm/output_0.out4_13": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 495616
+    },
+    "/model/layers.7/attn/qk_proj/MatMulNBits/output_0.out5_4_21": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 501760
+    },
+    "/model/layers.7/attn/GroupQueryAttention/output_0.out2_7": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 514048
+    },
+    "/model/layers.7/attn/o_proj/MatMulNBits/output_0.out5_4_22": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 520192
+    },
+    "/model/layers.7/post_attention_layernorm/output_3.out4_14": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 526336
+    },
+    "/model/layers.7/post_attention_layernorm/output_0.out4_14": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 532480
+    },
+    "/model/layers.7/mlp/Mul/output_0.out3_7": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 538624
+    },
+    "/model/layers.7/mlp/down_proj/MatMulNBits/output_0.out5_4_23": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 555008
+    },
+    "/model/layers.8/input_layernorm/output_3.out4_15": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 561152
+    },
+    "/model/layers.8/input_layernorm/output_0.out4_15": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 567296
+    },
+    "/model/layers.8/attn/qk_proj/MatMulNBits/output_0.out5_4_24": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 573440
+    },
+    "/model/layers.8/attn/GroupQueryAttention/output_0.out2_8": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 585728
+    },
+    "/model/layers.8/attn/o_proj/MatMulNBits/output_0.out5_4_25": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 591872
+    },
+    "/model/layers.8/post_attention_layernorm/output_3.out4_16": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 598016
+    },
+    "/model/layers.8/post_attention_layernorm/output_0.out4_16": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 604160
+    },
+    "/model/layers.8/mlp/Mul/output_0.out3_8": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 610304
+    },
+    "/model/layers.8/mlp/down_proj/MatMulNBits/output_0.out5_4_26": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 626688
+    },
+    "/model/layers.9/input_layernorm/output_3.out4_17": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 632832
+    },
+    "/model/layers.9/input_layernorm/output_0.out4_17": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 638976
+    },
+    "/model/layers.9/attn/qk_proj/MatMulNBits/output_0.out5_4_27": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 645120
+    },
+    "/model/layers.9/attn/GroupQueryAttention/output_0.out2_9": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 657408
+    },
+    "/model/layers.9/attn/o_proj/MatMulNBits/output_0.out5_4_28": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 663552
+    },
+    "/model/layers.9/post_attention_layernorm/output_3.out4_18": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 669696
+    },
+    "/model/layers.9/post_attention_layernorm/output_0.out4_18": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 675840
+    },
+    "/model/layers.9/mlp/Mul/output_0.out3_9": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 681984
+    },
+    "/model/layers.9/mlp/down_proj/MatMulNBits/output_0.out5_4_29": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 698368
+    },
+    "/model/layers.10/input_layernorm/output_3.out4_19": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 704512
+    },
+    "/model/layers.10/input_layernorm/output_0.out4_19": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 710656
+    },
+    "/model/layers.10/attn/qk_proj/MatMulNBits/output_0.out5_4_30": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 716800
+    },
+    "/model/layers.10/attn/GroupQueryAttention/output_0.out2_10": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 729088
+    },
+    "/model/layers.10/attn/o_proj/MatMulNBits/output_0.out5_4_31": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 735232
+    },
+    "/model/layers.10/post_attention_layernorm/output_3.out4_20": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 741376
+    },
+    "/model/layers.10/post_attention_layernorm/output_0.out4_20": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 747520
+    },
+    "/model/layers.10/mlp/Mul/output_0.out3_10": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 753664
+    },
+    "/model/layers.10/mlp/down_proj/MatMulNBits/output_0.out5_4_32": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 770048
+    },
+    "/model/layers.11/input_layernorm/output_3.out4_21": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 776192
+    },
+    "/model/layers.11/input_layernorm/output_0.out4_21": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 782336
+    },
+    "/model/layers.11/attn/qk_proj/MatMulNBits/output_0.out5_4_33": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 788480
+    },
+    "/model/layers.11/attn/GroupQueryAttention/output_0.out2_11": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 800768
+    },
+    "/model/layers.11/attn/o_proj/MatMulNBits/output_0.out5_4_34": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 806912
+    },
+    "/model/layers.11/post_attention_layernorm/output_3.out4_22": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 813056
+    },
+    "/model/layers.11/post_attention_layernorm/output_0.out4_22": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 819200
+    },
+    "/model/layers.11/mlp/Mul/output_0.out3_11": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 825344
+    },
+    "/model/layers.11/mlp/down_proj/MatMulNBits/output_0.out5_4_35": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 841728
+    },
+    "/model/layers.12/input_layernorm/output_3.out4_23": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 847872
+    },
+    "/model/layers.12/input_layernorm/output_0.out4_23": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 854016
+    },
+    "/model/layers.12/attn/qk_proj/MatMulNBits/output_0.out5_4_36": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 860160
+    },
+    "/model/layers.12/attn/GroupQueryAttention/output_0.out2_12": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 872448
+    },
+    "/model/layers.12/attn/o_proj/MatMulNBits/output_0.out5_4_37": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 878592
+    },
+    "/model/layers.12/post_attention_layernorm/output_3.out4_24": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 884736
+    },
+    "/model/layers.12/post_attention_layernorm/output_0.out4_24": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 890880
+    },
+    "/model/layers.12/mlp/Mul/output_0.out3_12": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 897024
+    },
+    "/model/layers.12/mlp/down_proj/MatMulNBits/output_0.out5_4_38": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 913408
+    },
+    "/model/layers.13/input_layernorm/output_3.out4_25": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 919552
+    },
+    "/model/layers.13/input_layernorm/output_0.out4_25": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 925696
+    },
+    "/model/layers.13/attn/qk_proj/MatMulNBits/output_0.out5_4_39": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 931840
+    },
+    "/model/layers.13/attn/GroupQueryAttention/output_0.out2_13": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 944128
+    },
+    "/model/layers.13/attn/o_proj/MatMulNBits/output_0.out5_4_40": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 950272
+    },
+    "/model/layers.13/post_attention_layernorm/output_3.out4_26": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 956416
+    },
+    "/model/layers.13/post_attention_layernorm/output_0.out4_26": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 962560
+    },
+    "/model/layers.13/mlp/Mul/output_0.out3_13": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 968704
+    },
+    "/model/layers.13/mlp/down_proj/MatMulNBits/output_0.out5_4_41": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 985088
+    },
+    "/model/layers.14/input_layernorm/output_3.out4_27": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 991232
+    },
+    "/model/layers.14/input_layernorm/output_0.out4_27": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 997376
+    },
+    "/model/layers.14/attn/qk_proj/MatMulNBits/output_0.out5_4_42": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1003520
+    },
+    "/model/layers.14/attn/GroupQueryAttention/output_0.out2_14": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1015808
+    },
+    "/model/layers.14/attn/o_proj/MatMulNBits/output_0.out5_4_43": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1021952
+    },
+    "/model/layers.14/post_attention_layernorm/output_3.out4_28": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1028096
+    },
+    "/model/layers.14/post_attention_layernorm/output_0.out4_28": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1034240
+    },
+    "/model/layers.14/mlp/Mul/output_0.out3_14": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1040384
+    },
+    "/model/layers.14/mlp/down_proj/MatMulNBits/output_0.out5_4_44": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1056768
+    },
+    "/model/layers.15/input_layernorm/output_3.out4_29": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1062912
+    },
+    "/model/layers.15/input_layernorm/output_0.out4_29": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1069056
+    },
+    "/model/layers.15/attn/qk_proj/MatMulNBits/output_0.out5_4_45": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1075200
+    },
+    "/model/layers.15/attn/GroupQueryAttention/output_0.out2_15": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1087488
+    },
+    "/model/layers.15/attn/o_proj/MatMulNBits/output_0.out5_4_46": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1093632
+    },
+    "/model/layers.15/post_attention_layernorm/output_3.out4_30": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1099776
+    },
+    "/model/layers.15/post_attention_layernorm/output_0.out4_30": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1105920
+    },
+    "/model/layers.15/mlp/Mul/output_0.out3_15": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1112064
+    },
+    "/model/layers.15/mlp/down_proj/MatMulNBits/output_0.out5_4_47": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1128448
+    },
+    "/model/layers.16/input_layernorm/output_3.out4_31": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1134592
+    },
+    "/model/layers.16/input_layernorm/output_0.out4_31": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1140736
+    },
+    "/model/layers.16/attn/qk_proj/MatMulNBits/output_0.out5_4_48": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1146880
+    },
+    "/model/layers.16/attn/GroupQueryAttention/output_0.out2_16": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1159168
+    },
+    "/model/layers.16/attn/o_proj/MatMulNBits/output_0.out5_4_49": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1165312
+    },
+    "/model/layers.16/post_attention_layernorm/output_3.out4_32": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1171456
+    },
+    "/model/layers.16/post_attention_layernorm/output_0.out4_32": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1177600
+    },
+    "/model/layers.16/mlp/Mul/output_0.out3_16": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1183744
+    },
+    "/model/layers.16/mlp/down_proj/MatMulNBits/output_0.out5_4_50": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1200128
+    },
+    "/model/layers.17/input_layernorm/output_3.out4_33": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1206272
+    },
+    "/model/layers.17/input_layernorm/output_0.out4_33": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1212416
+    },
+    "/model/layers.17/attn/qk_proj/MatMulNBits/output_0.out5_4_51": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1218560
+    },
+    "/model/layers.17/attn/GroupQueryAttention/output_0.out2_17": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1230848
+    },
+    "/model/layers.17/attn/o_proj/MatMulNBits/output_0.out5_4_52": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1236992
+    },
+    "/model/layers.17/post_attention_layernorm/output_3.out4_34": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1243136
+    },
+    "/model/layers.17/post_attention_layernorm/output_0.out4_34": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1249280
+    },
+    "/model/layers.17/mlp/Mul/output_0.out3_17": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1255424
+    },
+    "/model/layers.17/mlp/down_proj/MatMulNBits/output_0.out5_4_53": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1271808
+    },
+    "/model/layers.18/input_layernorm/output_3.out4_35": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1277952
+    },
+    "/model/layers.18/input_layernorm/output_0.out4_35": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1284096
+    },
+    "/model/layers.18/attn/qk_proj/MatMulNBits/output_0.out5_4_54": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1290240
+    },
+    "/model/layers.18/attn/GroupQueryAttention/output_0.out2_18": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1302528
+    },
+    "/model/layers.18/attn/o_proj/MatMulNBits/output_0.out5_4_55": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1308672
+    },
+    "/model/layers.18/post_attention_layernorm/output_3.out4_36": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1314816
+    },
+    "/model/layers.18/post_attention_layernorm/output_0.out4_36": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1320960
+    },
+    "/model/layers.18/mlp/Mul/output_0.out3_18": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1327104
+    },
+    "/model/layers.18/mlp/down_proj/MatMulNBits/output_0.out5_4_56": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1343488
+    },
+    "/model/layers.19/input_layernorm/output_3.out4_37": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1349632
+    },
+    "/model/layers.19/input_layernorm/output_0.out4_37": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1355776
+    },
+    "/model/layers.19/attn/qk_proj/MatMulNBits/output_0.out5_4_57": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1361920
+    },
+    "/model/layers.19/attn/GroupQueryAttention/output_0.out2_19": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1374208
+    },
+    "/model/layers.19/attn/o_proj/MatMulNBits/output_0.out5_4_58": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1380352
+    },
+    "/model/layers.19/post_attention_layernorm/output_3.out4_38": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1386496
+    },
+    "/model/layers.19/post_attention_layernorm/output_0.out4_38": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1392640
+    },
+    "/model/layers.19/mlp/Mul/output_0.out3_19": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1398784
+    },
+    "/model/layers.19/mlp/down_proj/MatMulNBits/output_0.out5_4_59": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1415168
+    },
+    "/model/layers.20/input_layernorm/output_3.out4_39": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1421312
+    },
+    "/model/layers.20/input_layernorm/output_0.out4_39": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1427456
+    },
+    "/model/layers.20/attn/qk_proj/MatMulNBits/output_0.out5_4_60": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1433600
+    },
+    "/model/layers.20/attn/GroupQueryAttention/output_0.out2_20": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1445888
+    },
+    "/model/layers.20/attn/o_proj/MatMulNBits/output_0.out5_4_61": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1452032
+    },
+    "/model/layers.20/post_attention_layernorm/output_3.out4_40": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1458176
+    },
+    "/model/layers.20/post_attention_layernorm/output_0.out4_40": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1464320
+    },
+    "/model/layers.20/mlp/Mul/output_0.out3_20": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1470464
+    },
+    "/model/layers.20/mlp/down_proj/MatMulNBits/output_0.out5_4_62": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1486848
+    },
+    "/model/layers.21/input_layernorm/output_3.out4_41": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1492992
+    },
+    "/model/layers.21/input_layernorm/output_0.out4_41": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1499136
+    },
+    "/model/layers.21/attn/qk_proj/MatMulNBits/output_0.out5_4_63": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1505280
+    },
+    "/model/layers.21/attn/GroupQueryAttention/output_0.out2_21": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1517568
+    },
+    "/model/layers.21/attn/o_proj/MatMulNBits/output_0.out5_4_64": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1523712
+    },
+    "/model/layers.21/post_attention_layernorm/output_3.out4_42": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1529856
+    },
+    "/model/layers.21/post_attention_layernorm/output_0.out4_42": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1536000
+    },
+    "/model/layers.21/mlp/Mul/output_0.out3_21": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1542144
+    },
+    "/model/layers.21/mlp/down_proj/MatMulNBits/output_0.out5_4_65": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1558528
+    },
+    "/model/layers.22/input_layernorm/output_3.out4_43": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1564672
+    },
+    "/model/layers.22/input_layernorm/output_0.out4_43": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1570816
+    },
+    "/model/layers.22/attn/qk_proj/MatMulNBits/output_0.out5_4_66": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1576960
+    },
+    "/model/layers.22/attn/GroupQueryAttention/output_0.out2_22": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1589248
+    },
+    "/model/layers.22/attn/o_proj/MatMulNBits/output_0.out5_4_67": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1595392
+    },
+    "/model/layers.22/post_attention_layernorm/output_3.out4_44": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1601536
+    },
+    "/model/layers.22/post_attention_layernorm/output_0.out4_44": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1607680
+    },
+    "/model/layers.22/mlp/Mul/output_0.out3_22": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1613824
+    },
+    "/model/layers.22/mlp/down_proj/MatMulNBits/output_0.out5_4_68": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1630208
+    },
+    "/model/layers.23/input_layernorm/output_3.out4_45": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1636352
+    },
+    "/model/layers.23/input_layernorm/output_0.out4_45": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1642496
+    },
+    "/model/layers.23/attn/qk_proj/MatMulNBits/output_0.out5_4_69": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1648640
+    },
+    "/model/layers.23/attn/GroupQueryAttention/output_0.out2_23": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1660928
+    },
+    "/model/layers.23/attn/o_proj/MatMulNBits/output_0.out5_4_70": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1667072
+    },
+    "/model/layers.23/post_attention_layernorm/output_3.out4_46": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1673216
+    },
+    "/model/layers.23/post_attention_layernorm/output_0.out4_46": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1679360
+    },
+    "/model/layers.23/mlp/Mul/output_0.out3_23": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1685504
+    },
+    "/model/layers.23/mlp/down_proj/MatMulNBits/output_0.out5_4_71": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1701888
+    },
+    "/model/layers.24/input_layernorm/output_3.out4_47": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1708032
+    },
+    "/model/layers.24/input_layernorm/output_0.out4_47": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1714176
+    },
+    "/model/layers.24/attn/qk_proj/MatMulNBits/output_0.out5_4_72": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1720320
+    },
+    "/model/layers.24/attn/GroupQueryAttention/output_0.out2_24": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1732608
+    },
+    "/model/layers.24/attn/o_proj/MatMulNBits/output_0.out5_4_73": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1738752
+    },
+    "/model/layers.24/post_attention_layernorm/output_3.out4_48": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1744896
+    },
+    "/model/layers.24/post_attention_layernorm/output_0.out4_48": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1751040
+    },
+    "/model/layers.24/mlp/Mul/output_0.out3_24": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1757184
+    },
+    "/model/layers.24/mlp/down_proj/MatMulNBits/output_0.out5_4_74": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1773568
+    },
+    "/model/layers.25/input_layernorm/output_3.out4_49": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1779712
+    },
+    "/model/layers.25/input_layernorm/output_0.out4_49": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1785856
+    },
+    "/model/layers.25/attn/qk_proj/MatMulNBits/output_0.out5_4_75": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1792000
+    },
+    "/model/layers.25/attn/GroupQueryAttention/output_0.out2_25": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1804288
+    },
+    "/model/layers.25/attn/o_proj/MatMulNBits/output_0.out5_4_76": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1810432
+    },
+    "/model/layers.25/post_attention_layernorm/output_3.out4_50": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1816576
+    },
+    "/model/layers.25/post_attention_layernorm/output_0.out4_50": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1822720
+    },
+    "/model/layers.25/mlp/Mul/output_0.out3_25": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1828864
+    },
+    "/model/layers.25/mlp/down_proj/MatMulNBits/output_0.out5_4_77": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1845248
+    },
+    "/model/layers.26/input_layernorm/output_3.out4_51": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1851392
+    },
+    "/model/layers.26/input_layernorm/output_0.out4_51": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1857536
+    },
+    "/model/layers.26/attn/qk_proj/MatMulNBits/output_0.out5_4_78": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1863680
+    },
+    "/model/layers.26/attn/GroupQueryAttention/output_0.out2_26": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1875968
+    },
+    "/model/layers.26/attn/o_proj/MatMulNBits/output_0.out5_4_79": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1882112
+    },
+    "/model/layers.26/post_attention_layernorm/output_3.out4_52": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1888256
+    },
+    "/model/layers.26/post_attention_layernorm/output_0.out4_52": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1894400
+    },
+    "/model/layers.26/mlp/Mul/output_0.out3_26": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1900544
+    },
+    "/model/layers.26/mlp/down_proj/MatMulNBits/output_0.out5_4_80": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1916928
+    },
+    "/model/layers.27/input_layernorm/output_3.out4_53": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1923072
+    },
+    "/model/layers.27/input_layernorm/output_0.out4_53": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1929216
+    },
+    "/model/layers.27/attn/qk_proj/MatMulNBits/output_0.out5_4_81": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1935360
+    },
+    "/model/layers.27/attn/GroupQueryAttention/output_0.out2_27": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1947648
+    },
+    "/model/layers.27/attn/o_proj/MatMulNBits/output_0.out5_4_82": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1953792
+    },
+    "/model/layers.27/post_attention_layernorm/output_3.out4_54": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1959936
+    },
+    "/model/layers.27/post_attention_layernorm/output_0.out4_54": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1966080
+    },
+    "/model/layers.27/mlp/Mul/output_0.out3_27": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1972224
+    },
+    "/model/layers.27/mlp/down_proj/MatMulNBits/output_0.out5_4_83": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1988608
+    },
+    "/model/layers.28/input_layernorm/output_3.out4_55": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1994752
+    },
+    "/model/layers.28/input_layernorm/output_0.out4_55": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2000896
+    },
+    "/model/layers.28/attn/qk_proj/MatMulNBits/output_0.out5_4_84": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2007040
+    },
+    "/model/layers.28/attn/GroupQueryAttention/output_0.out2_28": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2019328
+    },
+    "/model/layers.28/attn/o_proj/MatMulNBits/output_0.out5_4_85": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2025472
+    },
+    "/model/layers.28/post_attention_layernorm/output_3.out4_56": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2031616
+    },
+    "/model/layers.28/post_attention_layernorm/output_0.out4_56": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2037760
+    },
+    "/model/layers.28/mlp/Mul/output_0.out3_28": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2043904
+    },
+    "/model/layers.28/mlp/down_proj/MatMulNBits/output_0.out5_4_86": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2060288
+    },
+    "/model/layers.29/input_layernorm/output_3.out4_57": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2066432
+    },
+    "/model/layers.29/input_layernorm/output_0.out4_57": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2072576
+    },
+    "/model/layers.29/attn/qk_proj/MatMulNBits/output_0.out5_4_87": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2078720
+    },
+    "/model/layers.29/attn/GroupQueryAttention/output_0.out2_29": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2091008
+    },
+    "/model/layers.29/attn/o_proj/MatMulNBits/output_0.out5_4_88": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2097152
+    },
+    "/model/layers.29/post_attention_layernorm/output_3.out4_58": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2103296
+    },
+    "/model/layers.29/post_attention_layernorm/output_0.out4_58": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2109440
+    },
+    "/model/layers.29/mlp/Mul/output_0.out3_29": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2115584
+    },
+    "/model/layers.29/mlp/down_proj/MatMulNBits/output_0.out5_4_89": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2131968
+    },
+    "/model/layers.30/input_layernorm/output_3.out4_59": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2138112
+    },
+    "/model/layers.30/input_layernorm/output_0.out4_59": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2144256
+    },
+    "/model/layers.30/attn/qk_proj/MatMulNBits/output_0.out5_4_90": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2150400
+    },
+    "/model/layers.30/attn/GroupQueryAttention/output_0.out2_30": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2162688
+    },
+    "/model/layers.30/attn/o_proj/MatMulNBits/output_0.out5_4_91": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2168832
+    },
+    "/model/layers.30/post_attention_layernorm/output_3.out4_60": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2174976
+    },
+    "/model/layers.30/post_attention_layernorm/output_0.out4_60": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2181120
+    },
+    "/model/layers.30/mlp/Mul/output_0.out3_30": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2187264
+    },
+    "/model/layers.30/mlp/down_proj/MatMulNBits/output_0.out5_4_92": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2203648
+    },
+    "/model/layers.31/input_layernorm/output_3.out4_61": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2209792
+    },
+    "/model/layers.31/input_layernorm/output_0.out4_61": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2215936
+    },
+    "/model/layers.31/attn/qk_proj/MatMulNBits/output_0.out5_4_93": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        6144
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2222080
+    },
+    "/model/layers.31/attn/GroupQueryAttention/output_0.out2_31": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2234368
+    },
+    "/model/layers.31/attn/o_proj/MatMulNBits/output_0.out5_4_94": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2240512
+    },
+    "/model/layers.31/post_attention_layernorm/output_3.out4_62": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2246656
+    },
+    "/model/layers.31/post_attention_layernorm/output_0.out4_62": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2252800
+    },
+    "/model/layers.31/mlp/Mul/output_0.out3_31": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        8192
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2258944
+    },
+    "/model/layers.31/mlp/down_proj/MatMulNBits/output_0.out5_4_95": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2275328
+    },
+    "/model/layers.32/final_norm_layernorm/output_0.out4_63": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2281472
+    },
+    "model.layers.0.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 0,
+      "file_name": ".cache\\MatMulNBits_2_0_0.const",
+      "file_size": 18874368
+    },
+    "model.layers.0.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 18874368,
+      "file_name": ".cache\\MatMulNBits_2_0_1.const",
+      "file_size": 24576
+    },
+    "model.layers.0.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 18898944,
+      "file_name": ".cache\\MatMulNBits_2_0_2.const",
+      "file_size": 589824
+    },
+    "model.layers.0.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 19488768,
+      "file_name": ".cache\\MatMulNBits_2_0_3.const",
+      "file_size": 147456
+    },
+    "model.layers.0.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 19636224,
+      "file_name": ".cache\\MatMulNBits_2_0_4.const",
+      "file_size": 9437184
+    },
+    "model.layers.0.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 29073408,
+      "file_name": ".cache\\MatMulNBits_2_0_5.const",
+      "file_size": 12288
+    },
+    "model.layers.0.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 29085696,
+      "file_name": ".cache\\MatMulNBits_2_0_6.const",
+      "file_size": 294912
+    },
+    "model.layers.0.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 29380608,
+      "file_name": ".cache\\MatMulNBits_2_0_7.const",
+      "file_size": 73728
+    },
+    "model.layers.0.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 29454336,
+      "file_name": ".cache\\MatMulNBits_2_0_8.const",
+      "file_size": 9437184
+    },
+    "model.layers.0.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 38891520,
+      "file_name": ".cache\\MatMulNBits_2_0_9.const",
+      "file_size": 12288
+    },
+    "model.layers.0.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 38903808,
+      "file_name": ".cache\\MatMulNBits_2_0_10.const",
+      "file_size": 294912
+    },
+    "model.layers.0.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 39198720,
+      "file_name": ".cache\\MatMulNBits_2_0_11.const",
+      "file_size": 73728
+    },
+    "model.layers.0.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 39272448,
+      "file_name": ".cache\\MatMulNBits_2_0_12.const",
+      "file_size": 6144
+    },
+    "model.layers.0.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 39278592,
+      "file_name": ".cache\\MatMulNBits_2_0_13.const",
+      "file_size": 12582912
+    },
+    "model.layers.0.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 51861504,
+      "file_name": ".cache\\MatMulNBits_2_0_14.const",
+      "file_size": 786432
+    },
+    "model.layers.0.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 52647936,
+      "file_name": ".cache\\MatMulNBits_2_0_15.const",
+      "file_size": 98304
+    },
+    "model.layers.0.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 52746240,
+      "file_name": ".cache\\MatMulNBits_2_0_16.const",
+      "file_size": 32768
+    },
+    "model.layers.0.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 52779008,
+      "file_name": ".cache\\MatMulNBits_2_0_17.const",
+      "file_size": 12582912
+    },
+    "model.layers.0.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 65361920,
+      "file_name": ".cache\\MatMulNBits_2_0_18.const",
+      "file_size": 786432
+    },
+    "model.layers.0.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 66148352,
+      "file_name": ".cache\\MatMulNBits_2_0_19.const",
+      "file_size": 98304
+    },
+    "model.layers.0.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 66246656,
+      "file_name": ".cache\\MatMulNBits_2_0_20.const",
+      "file_size": 32768
+    },
+    "model.layers.0.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 66279424,
+      "file_name": ".cache\\MatMulNBits_2_0_21.const",
+      "file_size": 25165824
+    },
+    "model.layers.0.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 91445248,
+      "file_name": ".cache\\MatMulNBits_2_0_22.const",
+      "file_size": 12288
+    },
+    "model.layers.0.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 91457536,
+      "file_name": ".cache\\MatMulNBits_2_0_23.const",
+      "file_size": 786432
+    },
+    "model.layers.0.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 92243968,
+      "file_name": ".cache\\MatMulNBits_2_0_24.const",
+      "file_size": 196608
+    },
+    "model.layers.1.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 92440576,
+      "file_name": ".cache\\MatMulNBits_2_0_25.const",
+      "file_size": 6144
+    },
+    "model.layers.1.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 92446720,
+      "file_name": ".cache\\MatMulNBits_2_0_26.const",
+      "file_size": 18874368
+    },
+    "model.layers.1.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 111321088,
+      "file_name": ".cache\\MatMulNBits_2_0_27.const",
+      "file_size": 24576
+    },
+    "model.layers.1.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 111345664,
+      "file_name": ".cache\\MatMulNBits_2_0_28.const",
+      "file_size": 589824
+    },
+    "model.layers.1.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 111935488,
+      "file_name": ".cache\\MatMulNBits_2_0_29.const",
+      "file_size": 147456
+    },
+    "model.layers.1.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 112082944,
+      "file_name": ".cache\\MatMulNBits_2_0_30.const",
+      "file_size": 9437184
+    },
+    "model.layers.1.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 121520128,
+      "file_name": ".cache\\MatMulNBits_2_0_31.const",
+      "file_size": 12288
+    },
+    "model.layers.1.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 121532416,
+      "file_name": ".cache\\MatMulNBits_2_0_32.const",
+      "file_size": 294912
+    },
+    "model.layers.1.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 121827328,
+      "file_name": ".cache\\MatMulNBits_2_0_33.const",
+      "file_size": 73728
+    },
+    "model.layers.1.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 121901056,
+      "file_name": ".cache\\MatMulNBits_2_0_34.const",
+      "file_size": 9437184
+    },
+    "model.layers.1.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 131338240,
+      "file_name": ".cache\\MatMulNBits_2_0_35.const",
+      "file_size": 12288
+    },
+    "model.layers.1.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 131350528,
+      "file_name": ".cache\\MatMulNBits_2_0_36.const",
+      "file_size": 294912
+    },
+    "model.layers.1.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 131645440,
+      "file_name": ".cache\\MatMulNBits_2_0_37.const",
+      "file_size": 73728
+    },
+    "model.layers.1.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 131719168,
+      "file_name": ".cache\\MatMulNBits_2_0_38.const",
+      "file_size": 6144
+    },
+    "model.layers.1.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 131725312,
+      "file_name": ".cache\\MatMulNBits_2_0_39.const",
+      "file_size": 12582912
+    },
+    "model.layers.1.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 144308224,
+      "file_name": ".cache\\MatMulNBits_2_0_40.const",
+      "file_size": 786432
+    },
+    "model.layers.1.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 145094656,
+      "file_name": ".cache\\MatMulNBits_2_0_41.const",
+      "file_size": 98304
+    },
+    "model.layers.1.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 145192960,
+      "file_name": ".cache\\MatMulNBits_2_0_42.const",
+      "file_size": 32768
+    },
+    "model.layers.1.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 145225728,
+      "file_name": ".cache\\MatMulNBits_2_0_43.const",
+      "file_size": 12582912
+    },
+    "model.layers.1.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 157808640,
+      "file_name": ".cache\\MatMulNBits_2_0_44.const",
+      "file_size": 786432
+    },
+    "model.layers.1.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 158595072,
+      "file_name": ".cache\\MatMulNBits_2_0_45.const",
+      "file_size": 98304
+    },
+    "model.layers.1.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 158693376,
+      "file_name": ".cache\\MatMulNBits_2_0_46.const",
+      "file_size": 32768
+    },
+    "model.layers.1.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 158726144,
+      "file_name": ".cache\\MatMulNBits_2_0_47.const",
+      "file_size": 25165824
+    },
+    "model.layers.1.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 183891968,
+      "file_name": ".cache\\MatMulNBits_2_0_48.const",
+      "file_size": 12288
+    },
+    "model.layers.1.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 183904256,
+      "file_name": ".cache\\MatMulNBits_2_0_49.const",
+      "file_size": 786432
+    },
+    "model.layers.1.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 184690688,
+      "file_name": ".cache\\MatMulNBits_2_0_50.const",
+      "file_size": 196608
+    },
+    "model.layers.2.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 184887296,
+      "file_name": ".cache\\MatMulNBits_2_0_51.const",
+      "file_size": 6144
+    },
+    "model.layers.2.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 184893440,
+      "file_name": ".cache\\MatMulNBits_2_0_52.const",
+      "file_size": 18874368
+    },
+    "model.layers.2.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 203767808,
+      "file_name": ".cache\\MatMulNBits_2_0_53.const",
+      "file_size": 24576
+    },
+    "model.layers.2.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 203792384,
+      "file_name": ".cache\\MatMulNBits_2_0_54.const",
+      "file_size": 589824
+    },
+    "model.layers.2.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 204382208,
+      "file_name": ".cache\\MatMulNBits_2_0_55.const",
+      "file_size": 147456
+    },
+    "model.layers.2.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 204529664,
+      "file_name": ".cache\\MatMulNBits_2_0_56.const",
+      "file_size": 9437184
+    },
+    "model.layers.2.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 213966848,
+      "file_name": ".cache\\MatMulNBits_2_0_57.const",
+      "file_size": 12288
+    },
+    "model.layers.2.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 213979136,
+      "file_name": ".cache\\MatMulNBits_2_0_58.const",
+      "file_size": 294912
+    },
+    "model.layers.2.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 214274048,
+      "file_name": ".cache\\MatMulNBits_2_0_59.const",
+      "file_size": 73728
+    },
+    "model.layers.2.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 214347776,
+      "file_name": ".cache\\MatMulNBits_2_0_60.const",
+      "file_size": 9437184
+    },
+    "model.layers.2.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 223784960,
+      "file_name": ".cache\\MatMulNBits_2_0_61.const",
+      "file_size": 12288
+    },
+    "model.layers.2.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 223797248,
+      "file_name": ".cache\\MatMulNBits_2_0_62.const",
+      "file_size": 294912
+    },
+    "model.layers.2.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 224092160,
+      "file_name": ".cache\\MatMulNBits_2_0_63.const",
+      "file_size": 73728
+    },
+    "model.layers.2.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 224165888,
+      "file_name": ".cache\\MatMulNBits_2_0_64.const",
+      "file_size": 6144
+    },
+    "model.layers.2.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 224172032,
+      "file_name": ".cache\\MatMulNBits_2_0_65.const",
+      "file_size": 12582912
+    },
+    "model.layers.2.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 236754944,
+      "file_name": ".cache\\MatMulNBits_2_0_66.const",
+      "file_size": 786432
+    },
+    "model.layers.2.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 237541376,
+      "file_name": ".cache\\MatMulNBits_2_0_67.const",
+      "file_size": 98304
+    },
+    "model.layers.2.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 237639680,
+      "file_name": ".cache\\MatMulNBits_2_0_68.const",
+      "file_size": 32768
+    },
+    "model.layers.2.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 237672448,
+      "file_name": ".cache\\MatMulNBits_2_0_69.const",
+      "file_size": 12582912
+    },
+    "model.layers.2.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 250255360,
+      "file_name": ".cache\\MatMulNBits_2_0_70.const",
+      "file_size": 786432
+    },
+    "model.layers.2.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 251041792,
+      "file_name": ".cache\\MatMulNBits_2_0_71.const",
+      "file_size": 98304
+    },
+    "model.layers.2.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 251140096,
+      "file_name": ".cache\\MatMulNBits_2_0_72.const",
+      "file_size": 32768
+    },
+    "model.layers.2.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 251172864,
+      "file_name": ".cache\\MatMulNBits_2_0_73.const",
+      "file_size": 25165824
+    },
+    "model.layers.2.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 276338688,
+      "file_name": ".cache\\MatMulNBits_2_0_74.const",
+      "file_size": 12288
+    },
+    "model.layers.2.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 276350976,
+      "file_name": ".cache\\MatMulNBits_2_0_75.const",
+      "file_size": 786432
+    },
+    "model.layers.2.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 277137408,
+      "file_name": ".cache\\MatMulNBits_2_0_76.const",
+      "file_size": 196608
+    },
+    "model.layers.3.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 277334016,
+      "file_name": ".cache\\MatMulNBits_2_0_77.const",
+      "file_size": 6144
+    },
+    "model.layers.3.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 277340160,
+      "file_name": ".cache\\MatMulNBits_2_0_78.const",
+      "file_size": 18874368
+    },
+    "model.layers.3.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 296214528,
+      "file_name": ".cache\\MatMulNBits_2_0_79.const",
+      "file_size": 24576
+    },
+    "model.layers.3.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 296239104,
+      "file_name": ".cache\\MatMulNBits_2_0_80.const",
+      "file_size": 589824
+    },
+    "model.layers.3.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 296828928,
+      "file_name": ".cache\\MatMulNBits_2_0_81.const",
+      "file_size": 147456
+    },
+    "model.layers.3.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 296976384,
+      "file_name": ".cache\\MatMulNBits_2_0_82.const",
+      "file_size": 9437184
+    },
+    "model.layers.3.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 306413568,
+      "file_name": ".cache\\MatMulNBits_2_0_83.const",
+      "file_size": 12288
+    },
+    "model.layers.3.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 306425856,
+      "file_name": ".cache\\MatMulNBits_2_0_84.const",
+      "file_size": 294912
+    },
+    "model.layers.3.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 306720768,
+      "file_name": ".cache\\MatMulNBits_2_0_85.const",
+      "file_size": 73728
+    },
+    "model.layers.3.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 306794496,
+      "file_name": ".cache\\MatMulNBits_2_0_86.const",
+      "file_size": 9437184
+    },
+    "model.layers.3.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 316231680,
+      "file_name": ".cache\\MatMulNBits_2_0_87.const",
+      "file_size": 12288
+    },
+    "model.layers.3.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 316243968,
+      "file_name": ".cache\\MatMulNBits_2_0_88.const",
+      "file_size": 294912
+    },
+    "model.layers.3.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 316538880,
+      "file_name": ".cache\\MatMulNBits_2_0_89.const",
+      "file_size": 73728
+    },
+    "model.layers.3.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 316612608,
+      "file_name": ".cache\\MatMulNBits_2_0_90.const",
+      "file_size": 6144
+    },
+    "model.layers.3.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 316618752,
+      "file_name": ".cache\\MatMulNBits_2_0_91.const",
+      "file_size": 12582912
+    },
+    "model.layers.3.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 329201664,
+      "file_name": ".cache\\MatMulNBits_2_0_92.const",
+      "file_size": 786432
+    },
+    "model.layers.3.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 329988096,
+      "file_name": ".cache\\MatMulNBits_2_0_93.const",
+      "file_size": 98304
+    },
+    "model.layers.3.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 330086400,
+      "file_name": ".cache\\MatMulNBits_2_0_94.const",
+      "file_size": 32768
+    },
+    "model.layers.3.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 330119168,
+      "file_name": ".cache\\MatMulNBits_2_0_95.const",
+      "file_size": 12582912
+    },
+    "model.layers.3.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 342702080,
+      "file_name": ".cache\\MatMulNBits_2_0_96.const",
+      "file_size": 786432
+    },
+    "model.layers.3.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 343488512,
+      "file_name": ".cache\\MatMulNBits_2_0_97.const",
+      "file_size": 98304
+    },
+    "model.layers.3.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 343586816,
+      "file_name": ".cache\\MatMulNBits_2_0_98.const",
+      "file_size": 32768
+    },
+    "model.layers.3.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 343619584,
+      "file_name": ".cache\\MatMulNBits_2_0_99.const",
+      "file_size": 25165824
+    },
+    "model.layers.3.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 368785408,
+      "file_name": ".cache\\MatMulNBits_2_0_100.const",
+      "file_size": 12288
+    },
+    "model.layers.3.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 368797696,
+      "file_name": ".cache\\MatMulNBits_2_0_101.const",
+      "file_size": 786432
+    },
+    "model.layers.3.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 369584128,
+      "file_name": ".cache\\MatMulNBits_2_0_102.const",
+      "file_size": 196608
+    },
+    "model.layers.4.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 369780736,
+      "file_name": ".cache\\MatMulNBits_2_0_103.const",
+      "file_size": 6144
+    },
+    "model.layers.4.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 369786880,
+      "file_name": ".cache\\MatMulNBits_2_0_104.const",
+      "file_size": 18874368
+    },
+    "model.layers.4.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 388661248,
+      "file_name": ".cache\\MatMulNBits_2_0_105.const",
+      "file_size": 24576
+    },
+    "model.layers.4.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 388685824,
+      "file_name": ".cache\\MatMulNBits_2_0_106.const",
+      "file_size": 589824
+    },
+    "model.layers.4.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 389275648,
+      "file_name": ".cache\\MatMulNBits_2_0_107.const",
+      "file_size": 147456
+    },
+    "model.layers.4.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 389423104,
+      "file_name": ".cache\\MatMulNBits_2_0_108.const",
+      "file_size": 9437184
+    },
+    "model.layers.4.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 398860288,
+      "file_name": ".cache\\MatMulNBits_2_0_109.const",
+      "file_size": 12288
+    },
+    "model.layers.4.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 398872576,
+      "file_name": ".cache\\MatMulNBits_2_0_110.const",
+      "file_size": 294912
+    },
+    "model.layers.4.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 399167488,
+      "file_name": ".cache\\MatMulNBits_2_0_111.const",
+      "file_size": 73728
+    },
+    "model.layers.4.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 399241216,
+      "file_name": ".cache\\MatMulNBits_2_0_112.const",
+      "file_size": 9437184
+    },
+    "model.layers.4.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 408678400,
+      "file_name": ".cache\\MatMulNBits_2_0_113.const",
+      "file_size": 12288
+    },
+    "model.layers.4.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 408690688,
+      "file_name": ".cache\\MatMulNBits_2_0_114.const",
+      "file_size": 294912
+    },
+    "model.layers.4.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 408985600,
+      "file_name": ".cache\\MatMulNBits_2_0_115.const",
+      "file_size": 73728
+    },
+    "model.layers.4.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 409059328,
+      "file_name": ".cache\\MatMulNBits_2_0_116.const",
+      "file_size": 6144
+    },
+    "model.layers.4.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 409065472,
+      "file_name": ".cache\\MatMulNBits_2_0_117.const",
+      "file_size": 12582912
+    },
+    "model.layers.4.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 421648384,
+      "file_name": ".cache\\MatMulNBits_2_0_118.const",
+      "file_size": 786432
+    },
+    "model.layers.4.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 422434816,
+      "file_name": ".cache\\MatMulNBits_2_0_119.const",
+      "file_size": 98304
+    },
+    "model.layers.4.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 422533120,
+      "file_name": ".cache\\MatMulNBits_2_0_120.const",
+      "file_size": 32768
+    },
+    "model.layers.4.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 422565888,
+      "file_name": ".cache\\MatMulNBits_2_0_121.const",
+      "file_size": 12582912
+    },
+    "model.layers.4.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 435148800,
+      "file_name": ".cache\\MatMulNBits_2_0_122.const",
+      "file_size": 786432
+    },
+    "model.layers.4.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 435935232,
+      "file_name": ".cache\\MatMulNBits_2_0_123.const",
+      "file_size": 98304
+    },
+    "model.layers.4.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 436033536,
+      "file_name": ".cache\\MatMulNBits_2_0_124.const",
+      "file_size": 32768
+    },
+    "model.layers.4.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 436066304,
+      "file_name": ".cache\\MatMulNBits_2_0_125.const",
+      "file_size": 25165824
+    },
+    "model.layers.4.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 461232128,
+      "file_name": ".cache\\MatMulNBits_2_0_126.const",
+      "file_size": 12288
+    },
+    "model.layers.4.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 461244416,
+      "file_name": ".cache\\MatMulNBits_2_0_127.const",
+      "file_size": 786432
+    },
+    "model.layers.4.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 462030848,
+      "file_name": ".cache\\MatMulNBits_2_0_128.const",
+      "file_size": 196608
+    },
+    "model.layers.5.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 462227456,
+      "file_name": ".cache\\MatMulNBits_2_0_129.const",
+      "file_size": 6144
+    },
+    "model.layers.5.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 462233600,
+      "file_name": ".cache\\MatMulNBits_2_0_130.const",
+      "file_size": 18874368
+    },
+    "model.layers.5.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 481107968,
+      "file_name": ".cache\\MatMulNBits_2_0_131.const",
+      "file_size": 24576
+    },
+    "model.layers.5.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 481132544,
+      "file_name": ".cache\\MatMulNBits_2_0_132.const",
+      "file_size": 589824
+    },
+    "model.layers.5.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 481722368,
+      "file_name": ".cache\\MatMulNBits_2_0_133.const",
+      "file_size": 147456
+    },
+    "model.layers.5.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 481869824,
+      "file_name": ".cache\\MatMulNBits_2_0_134.const",
+      "file_size": 9437184
+    },
+    "model.layers.5.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 491307008,
+      "file_name": ".cache\\MatMulNBits_2_0_135.const",
+      "file_size": 12288
+    },
+    "model.layers.5.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 491319296,
+      "file_name": ".cache\\MatMulNBits_2_0_136.const",
+      "file_size": 294912
+    },
+    "model.layers.5.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 491614208,
+      "file_name": ".cache\\MatMulNBits_2_0_137.const",
+      "file_size": 73728
+    },
+    "model.layers.5.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 491687936,
+      "file_name": ".cache\\MatMulNBits_2_0_138.const",
+      "file_size": 9437184
+    },
+    "model.layers.5.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 501125120,
+      "file_name": ".cache\\MatMulNBits_2_0_139.const",
+      "file_size": 12288
+    },
+    "model.layers.5.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 501137408,
+      "file_name": ".cache\\MatMulNBits_2_0_140.const",
+      "file_size": 294912
+    },
+    "model.layers.5.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 501432320,
+      "file_name": ".cache\\MatMulNBits_2_0_141.const",
+      "file_size": 73728
+    },
+    "model.layers.5.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 501506048,
+      "file_name": ".cache\\MatMulNBits_2_0_142.const",
+      "file_size": 6144
+    },
+    "model.layers.5.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 501512192,
+      "file_name": ".cache\\MatMulNBits_2_0_143.const",
+      "file_size": 12582912
+    },
+    "model.layers.5.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 514095104,
+      "file_name": ".cache\\MatMulNBits_2_0_144.const",
+      "file_size": 786432
+    },
+    "model.layers.5.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 514881536,
+      "file_name": ".cache\\MatMulNBits_2_0_145.const",
+      "file_size": 98304
+    },
+    "model.layers.5.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 514979840,
+      "file_name": ".cache\\MatMulNBits_2_0_146.const",
+      "file_size": 32768
+    },
+    "model.layers.5.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 515012608,
+      "file_name": ".cache\\MatMulNBits_2_0_147.const",
+      "file_size": 12582912
+    },
+    "model.layers.5.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 527595520,
+      "file_name": ".cache\\MatMulNBits_2_0_148.const",
+      "file_size": 786432
+    },
+    "model.layers.5.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 528381952,
+      "file_name": ".cache\\MatMulNBits_2_0_149.const",
+      "file_size": 98304
+    },
+    "model.layers.5.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 528480256,
+      "file_name": ".cache\\MatMulNBits_2_0_150.const",
+      "file_size": 32768
+    },
+    "model.layers.5.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 528513024,
+      "file_name": ".cache\\MatMulNBits_2_0_151.const",
+      "file_size": 25165824
+    },
+    "model.layers.5.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 553678848,
+      "file_name": ".cache\\MatMulNBits_2_0_152.const",
+      "file_size": 12288
+    },
+    "model.layers.5.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 553691136,
+      "file_name": ".cache\\MatMulNBits_2_0_153.const",
+      "file_size": 786432
+    },
+    "model.layers.5.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 554477568,
+      "file_name": ".cache\\MatMulNBits_2_0_154.const",
+      "file_size": 196608
+    },
+    "model.layers.6.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 554674176,
+      "file_name": ".cache\\MatMulNBits_2_0_155.const",
+      "file_size": 6144
+    },
+    "model.layers.6.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 554680320,
+      "file_name": ".cache\\MatMulNBits_2_0_156.const",
+      "file_size": 18874368
+    },
+    "model.layers.6.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 573554688,
+      "file_name": ".cache\\MatMulNBits_2_0_157.const",
+      "file_size": 24576
+    },
+    "model.layers.6.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 573579264,
+      "file_name": ".cache\\MatMulNBits_2_0_158.const",
+      "file_size": 589824
+    },
+    "model.layers.6.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 574169088,
+      "file_name": ".cache\\MatMulNBits_2_0_159.const",
+      "file_size": 147456
+    },
+    "model.layers.6.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 574316544,
+      "file_name": ".cache\\MatMulNBits_2_0_160.const",
+      "file_size": 9437184
+    },
+    "model.layers.6.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 583753728,
+      "file_name": ".cache\\MatMulNBits_2_0_161.const",
+      "file_size": 12288
+    },
+    "model.layers.6.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 583766016,
+      "file_name": ".cache\\MatMulNBits_2_0_162.const",
+      "file_size": 294912
+    },
+    "model.layers.6.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 584060928,
+      "file_name": ".cache\\MatMulNBits_2_0_163.const",
+      "file_size": 73728
+    },
+    "model.layers.6.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 584134656,
+      "file_name": ".cache\\MatMulNBits_2_0_164.const",
+      "file_size": 9437184
+    },
+    "model.layers.6.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 593571840,
+      "file_name": ".cache\\MatMulNBits_2_0_165.const",
+      "file_size": 12288
+    },
+    "model.layers.6.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 593584128,
+      "file_name": ".cache\\MatMulNBits_2_0_166.const",
+      "file_size": 294912
+    },
+    "model.layers.6.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 593879040,
+      "file_name": ".cache\\MatMulNBits_2_0_167.const",
+      "file_size": 73728
+    },
+    "model.layers.6.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 593952768,
+      "file_name": ".cache\\MatMulNBits_2_0_168.const",
+      "file_size": 6144
+    },
+    "model.layers.6.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 593958912,
+      "file_name": ".cache\\MatMulNBits_2_0_169.const",
+      "file_size": 12582912
+    },
+    "model.layers.6.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 606541824,
+      "file_name": ".cache\\MatMulNBits_2_0_170.const",
+      "file_size": 786432
+    },
+    "model.layers.6.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 607328256,
+      "file_name": ".cache\\MatMulNBits_2_0_171.const",
+      "file_size": 98304
+    },
+    "model.layers.6.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 607426560,
+      "file_name": ".cache\\MatMulNBits_2_0_172.const",
+      "file_size": 32768
+    },
+    "model.layers.6.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 607459328,
+      "file_name": ".cache\\MatMulNBits_2_0_173.const",
+      "file_size": 12582912
+    },
+    "model.layers.6.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 620042240,
+      "file_name": ".cache\\MatMulNBits_2_0_174.const",
+      "file_size": 786432
+    },
+    "model.layers.6.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 620828672,
+      "file_name": ".cache\\MatMulNBits_2_0_175.const",
+      "file_size": 98304
+    },
+    "model.layers.6.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 620926976,
+      "file_name": ".cache\\MatMulNBits_2_0_176.const",
+      "file_size": 32768
+    },
+    "model.layers.6.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 620959744,
+      "file_name": ".cache\\MatMulNBits_2_0_177.const",
+      "file_size": 25165824
+    },
+    "model.layers.6.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 646125568,
+      "file_name": ".cache\\MatMulNBits_2_0_178.const",
+      "file_size": 12288
+    },
+    "model.layers.6.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 646137856,
+      "file_name": ".cache\\MatMulNBits_2_0_179.const",
+      "file_size": 786432
+    },
+    "model.layers.6.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 646924288,
+      "file_name": ".cache\\MatMulNBits_2_0_180.const",
+      "file_size": 196608
+    },
+    "model.layers.7.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 647120896,
+      "file_name": ".cache\\MatMulNBits_2_0_181.const",
+      "file_size": 6144
+    },
+    "model.layers.7.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 647127040,
+      "file_name": ".cache\\MatMulNBits_2_0_182.const",
+      "file_size": 18874368
+    },
+    "model.layers.7.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 666001408,
+      "file_name": ".cache\\MatMulNBits_2_0_183.const",
+      "file_size": 24576
+    },
+    "model.layers.7.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 666025984,
+      "file_name": ".cache\\MatMulNBits_2_0_184.const",
+      "file_size": 589824
+    },
+    "model.layers.7.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 666615808,
+      "file_name": ".cache\\MatMulNBits_2_0_185.const",
+      "file_size": 147456
+    },
+    "model.layers.7.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 666763264,
+      "file_name": ".cache\\MatMulNBits_2_0_186.const",
+      "file_size": 9437184
+    },
+    "model.layers.7.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 676200448,
+      "file_name": ".cache\\MatMulNBits_2_0_187.const",
+      "file_size": 12288
+    },
+    "model.layers.7.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 676212736,
+      "file_name": ".cache\\MatMulNBits_2_0_188.const",
+      "file_size": 294912
+    },
+    "model.layers.7.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 676507648,
+      "file_name": ".cache\\MatMulNBits_2_0_189.const",
+      "file_size": 73728
+    },
+    "model.layers.7.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 676581376,
+      "file_name": ".cache\\MatMulNBits_2_0_190.const",
+      "file_size": 9437184
+    },
+    "model.layers.7.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 686018560,
+      "file_name": ".cache\\MatMulNBits_2_0_191.const",
+      "file_size": 12288
+    },
+    "model.layers.7.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 686030848,
+      "file_name": ".cache\\MatMulNBits_2_0_192.const",
+      "file_size": 294912
+    },
+    "model.layers.7.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 686325760,
+      "file_name": ".cache\\MatMulNBits_2_0_193.const",
+      "file_size": 73728
+    },
+    "model.layers.7.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 686399488,
+      "file_name": ".cache\\MatMulNBits_2_0_194.const",
+      "file_size": 6144
+    },
+    "model.layers.7.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 686405632,
+      "file_name": ".cache\\MatMulNBits_2_0_195.const",
+      "file_size": 12582912
+    },
+    "model.layers.7.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 698988544,
+      "file_name": ".cache\\MatMulNBits_2_0_196.const",
+      "file_size": 786432
+    },
+    "model.layers.7.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 699774976,
+      "file_name": ".cache\\MatMulNBits_2_0_197.const",
+      "file_size": 98304
+    },
+    "model.layers.7.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 699873280,
+      "file_name": ".cache\\MatMulNBits_2_0_198.const",
+      "file_size": 32768
+    },
+    "model.layers.7.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 699906048,
+      "file_name": ".cache\\MatMulNBits_2_0_199.const",
+      "file_size": 12582912
+    },
+    "model.layers.7.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 712488960,
+      "file_name": ".cache\\MatMulNBits_2_0_200.const",
+      "file_size": 786432
+    },
+    "model.layers.7.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 713275392,
+      "file_name": ".cache\\MatMulNBits_2_0_201.const",
+      "file_size": 98304
+    },
+    "model.layers.7.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 713373696,
+      "file_name": ".cache\\MatMulNBits_2_0_202.const",
+      "file_size": 32768
+    },
+    "model.layers.7.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 713406464,
+      "file_name": ".cache\\MatMulNBits_2_0_203.const",
+      "file_size": 25165824
+    },
+    "model.layers.7.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 738572288,
+      "file_name": ".cache\\MatMulNBits_2_0_204.const",
+      "file_size": 12288
+    },
+    "model.layers.7.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 738584576,
+      "file_name": ".cache\\MatMulNBits_2_0_205.const",
+      "file_size": 786432
+    },
+    "model.layers.7.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 739371008,
+      "file_name": ".cache\\MatMulNBits_2_0_206.const",
+      "file_size": 196608
+    },
+    "model.layers.8.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 739567616,
+      "file_name": ".cache\\MatMulNBits_2_0_207.const",
+      "file_size": 6144
+    },
+    "model.layers.8.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 739573760,
+      "file_name": ".cache\\MatMulNBits_2_0_208.const",
+      "file_size": 18874368
+    },
+    "model.layers.8.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 758448128,
+      "file_name": ".cache\\MatMulNBits_2_0_209.const",
+      "file_size": 24576
+    },
+    "model.layers.8.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 758472704,
+      "file_name": ".cache\\MatMulNBits_2_0_210.const",
+      "file_size": 589824
+    },
+    "model.layers.8.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 759062528,
+      "file_name": ".cache\\MatMulNBits_2_0_211.const",
+      "file_size": 147456
+    },
+    "model.layers.8.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 759209984,
+      "file_name": ".cache\\MatMulNBits_2_0_212.const",
+      "file_size": 9437184
+    },
+    "model.layers.8.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 768647168,
+      "file_name": ".cache\\MatMulNBits_2_0_213.const",
+      "file_size": 12288
+    },
+    "model.layers.8.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 768659456,
+      "file_name": ".cache\\MatMulNBits_2_0_214.const",
+      "file_size": 294912
+    },
+    "model.layers.8.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 768954368,
+      "file_name": ".cache\\MatMulNBits_2_0_215.const",
+      "file_size": 73728
+    },
+    "model.layers.8.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 769028096,
+      "file_name": ".cache\\MatMulNBits_2_0_216.const",
+      "file_size": 9437184
+    },
+    "model.layers.8.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 778465280,
+      "file_name": ".cache\\MatMulNBits_2_0_217.const",
+      "file_size": 12288
+    },
+    "model.layers.8.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 778477568,
+      "file_name": ".cache\\MatMulNBits_2_0_218.const",
+      "file_size": 294912
+    },
+    "model.layers.8.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 778772480,
+      "file_name": ".cache\\MatMulNBits_2_0_219.const",
+      "file_size": 73728
+    },
+    "model.layers.8.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 778846208,
+      "file_name": ".cache\\MatMulNBits_2_0_220.const",
+      "file_size": 6144
+    },
+    "model.layers.8.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 778852352,
+      "file_name": ".cache\\MatMulNBits_2_0_221.const",
+      "file_size": 12582912
+    },
+    "model.layers.8.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 791435264,
+      "file_name": ".cache\\MatMulNBits_2_0_222.const",
+      "file_size": 786432
+    },
+    "model.layers.8.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 792221696,
+      "file_name": ".cache\\MatMulNBits_2_0_223.const",
+      "file_size": 98304
+    },
+    "model.layers.8.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 792320000,
+      "file_name": ".cache\\MatMulNBits_2_0_224.const",
+      "file_size": 32768
+    },
+    "model.layers.8.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 792352768,
+      "file_name": ".cache\\MatMulNBits_2_0_225.const",
+      "file_size": 12582912
+    },
+    "model.layers.8.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 804935680,
+      "file_name": ".cache\\MatMulNBits_2_0_226.const",
+      "file_size": 786432
+    },
+    "model.layers.8.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 805722112,
+      "file_name": ".cache\\MatMulNBits_2_0_227.const",
+      "file_size": 98304
+    },
+    "model.layers.8.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 805820416,
+      "file_name": ".cache\\MatMulNBits_2_0_228.const",
+      "file_size": 32768
+    },
+    "model.layers.8.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 805853184,
+      "file_name": ".cache\\MatMulNBits_2_0_229.const",
+      "file_size": 25165824
+    },
+    "model.layers.8.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 831019008,
+      "file_name": ".cache\\MatMulNBits_2_0_230.const",
+      "file_size": 12288
+    },
+    "model.layers.8.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 831031296,
+      "file_name": ".cache\\MatMulNBits_2_0_231.const",
+      "file_size": 786432
+    },
+    "model.layers.8.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 831817728,
+      "file_name": ".cache\\MatMulNBits_2_0_232.const",
+      "file_size": 196608
+    },
+    "model.layers.9.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 832014336,
+      "file_name": ".cache\\MatMulNBits_2_0_233.const",
+      "file_size": 6144
+    },
+    "model.layers.9.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 832020480,
+      "file_name": ".cache\\MatMulNBits_2_0_234.const",
+      "file_size": 18874368
+    },
+    "model.layers.9.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 850894848,
+      "file_name": ".cache\\MatMulNBits_2_0_235.const",
+      "file_size": 24576
+    },
+    "model.layers.9.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 850919424,
+      "file_name": ".cache\\MatMulNBits_2_0_236.const",
+      "file_size": 589824
+    },
+    "model.layers.9.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 851509248,
+      "file_name": ".cache\\MatMulNBits_2_0_237.const",
+      "file_size": 147456
+    },
+    "model.layers.9.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 851656704,
+      "file_name": ".cache\\MatMulNBits_2_0_238.const",
+      "file_size": 9437184
+    },
+    "model.layers.9.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 861093888,
+      "file_name": ".cache\\MatMulNBits_2_0_239.const",
+      "file_size": 12288
+    },
+    "model.layers.9.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 861106176,
+      "file_name": ".cache\\MatMulNBits_2_0_240.const",
+      "file_size": 294912
+    },
+    "model.layers.9.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 861401088,
+      "file_name": ".cache\\MatMulNBits_2_0_241.const",
+      "file_size": 73728
+    },
+    "model.layers.9.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 861474816,
+      "file_name": ".cache\\MatMulNBits_2_0_242.const",
+      "file_size": 9437184
+    },
+    "model.layers.9.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 870912000,
+      "file_name": ".cache\\MatMulNBits_2_0_243.const",
+      "file_size": 12288
+    },
+    "model.layers.9.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 870924288,
+      "file_name": ".cache\\MatMulNBits_2_0_244.const",
+      "file_size": 294912
+    },
+    "model.layers.9.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 871219200,
+      "file_name": ".cache\\MatMulNBits_2_0_245.const",
+      "file_size": 73728
+    },
+    "model.layers.9.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 871292928,
+      "file_name": ".cache\\MatMulNBits_2_0_246.const",
+      "file_size": 6144
+    },
+    "model.layers.9.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 871299072,
+      "file_name": ".cache\\MatMulNBits_2_0_247.const",
+      "file_size": 12582912
+    },
+    "model.layers.9.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 883881984,
+      "file_name": ".cache\\MatMulNBits_2_0_248.const",
+      "file_size": 786432
+    },
+    "model.layers.9.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 884668416,
+      "file_name": ".cache\\MatMulNBits_2_0_249.const",
+      "file_size": 98304
+    },
+    "model.layers.9.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 884766720,
+      "file_name": ".cache\\MatMulNBits_2_0_250.const",
+      "file_size": 32768
+    },
+    "model.layers.9.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 884799488,
+      "file_name": ".cache\\MatMulNBits_2_0_251.const",
+      "file_size": 12582912
+    },
+    "model.layers.9.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 897382400,
+      "file_name": ".cache\\MatMulNBits_2_0_252.const",
+      "file_size": 786432
+    },
+    "model.layers.9.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 898168832,
+      "file_name": ".cache\\MatMulNBits_2_0_253.const",
+      "file_size": 98304
+    },
+    "model.layers.9.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 898267136,
+      "file_name": ".cache\\MatMulNBits_2_0_254.const",
+      "file_size": 32768
+    },
+    "model.layers.9.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 898299904,
+      "file_name": ".cache\\MatMulNBits_2_0_255.const",
+      "file_size": 25165824
+    },
+    "model.layers.9.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 923465728,
+      "file_name": ".cache\\MatMulNBits_2_0_256.const",
+      "file_size": 12288
+    },
+    "model.layers.9.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 923478016,
+      "file_name": ".cache\\MatMulNBits_2_0_257.const",
+      "file_size": 786432
+    },
+    "model.layers.9.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 924264448,
+      "file_name": ".cache\\MatMulNBits_2_0_258.const",
+      "file_size": 196608
+    },
+    "model.layers.10.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 924461056,
+      "file_name": ".cache\\MatMulNBits_2_0_259.const",
+      "file_size": 6144
+    },
+    "model.layers.10.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 924467200,
+      "file_name": ".cache\\MatMulNBits_2_0_260.const",
+      "file_size": 18874368
+    },
+    "model.layers.10.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 943341568,
+      "file_name": ".cache\\MatMulNBits_2_0_261.const",
+      "file_size": 24576
+    },
+    "model.layers.10.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 943366144,
+      "file_name": ".cache\\MatMulNBits_2_0_262.const",
+      "file_size": 589824
+    },
+    "model.layers.10.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 943955968,
+      "file_name": ".cache\\MatMulNBits_2_0_263.const",
+      "file_size": 147456
+    },
+    "model.layers.10.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 944103424,
+      "file_name": ".cache\\MatMulNBits_2_0_264.const",
+      "file_size": 9437184
+    },
+    "model.layers.10.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 953540608,
+      "file_name": ".cache\\MatMulNBits_2_0_265.const",
+      "file_size": 12288
+    },
+    "model.layers.10.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 953552896,
+      "file_name": ".cache\\MatMulNBits_2_0_266.const",
+      "file_size": 294912
+    },
+    "model.layers.10.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 953847808,
+      "file_name": ".cache\\MatMulNBits_2_0_267.const",
+      "file_size": 73728
+    },
+    "model.layers.10.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 953921536,
+      "file_name": ".cache\\MatMulNBits_2_0_268.const",
+      "file_size": 9437184
+    },
+    "model.layers.10.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 963358720,
+      "file_name": ".cache\\MatMulNBits_2_0_269.const",
+      "file_size": 12288
+    },
+    "model.layers.10.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 963371008,
+      "file_name": ".cache\\MatMulNBits_2_0_270.const",
+      "file_size": 294912
+    },
+    "model.layers.10.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 963665920,
+      "file_name": ".cache\\MatMulNBits_2_0_271.const",
+      "file_size": 73728
+    },
+    "model.layers.10.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 963739648,
+      "file_name": ".cache\\MatMulNBits_2_0_272.const",
+      "file_size": 6144
+    },
+    "model.layers.10.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 963745792,
+      "file_name": ".cache\\MatMulNBits_2_0_273.const",
+      "file_size": 12582912
+    },
+    "model.layers.10.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 976328704,
+      "file_name": ".cache\\MatMulNBits_2_0_274.const",
+      "file_size": 786432
+    },
+    "model.layers.10.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 977115136,
+      "file_name": ".cache\\MatMulNBits_2_0_275.const",
+      "file_size": 98304
+    },
+    "model.layers.10.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 977213440,
+      "file_name": ".cache\\MatMulNBits_2_0_276.const",
+      "file_size": 32768
+    },
+    "model.layers.10.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 977246208,
+      "file_name": ".cache\\MatMulNBits_2_0_277.const",
+      "file_size": 12582912
+    },
+    "model.layers.10.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 989829120,
+      "file_name": ".cache\\MatMulNBits_2_0_278.const",
+      "file_size": 786432
+    },
+    "model.layers.10.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 990615552,
+      "file_name": ".cache\\MatMulNBits_2_0_279.const",
+      "file_size": 98304
+    },
+    "model.layers.10.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 990713856,
+      "file_name": ".cache\\MatMulNBits_2_0_280.const",
+      "file_size": 32768
+    },
+    "model.layers.10.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 990746624,
+      "file_name": ".cache\\MatMulNBits_2_0_281.const",
+      "file_size": 25165824
+    },
+    "model.layers.10.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1015912448,
+      "file_name": ".cache\\MatMulNBits_2_0_282.const",
+      "file_size": 12288
+    },
+    "model.layers.10.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1015924736,
+      "file_name": ".cache\\MatMulNBits_2_0_283.const",
+      "file_size": 786432
+    },
+    "model.layers.10.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 1016711168,
+      "file_name": ".cache\\MatMulNBits_2_0_284.const",
+      "file_size": 196608
+    },
+    "model.layers.11.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1016907776,
+      "file_name": ".cache\\MatMulNBits_2_0_285.const",
+      "file_size": 6144
+    },
+    "model.layers.11.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 1016913920,
+      "file_name": ".cache\\MatMulNBits_2_0_286.const",
+      "file_size": 18874368
+    },
+    "model.layers.11.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 1035788288,
+      "file_name": ".cache\\MatMulNBits_2_0_287.const",
+      "file_size": 24576
+    },
+    "model.layers.11.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 1035812864,
+      "file_name": ".cache\\MatMulNBits_2_0_288.const",
+      "file_size": 589824
+    },
+    "model.layers.11.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 1036402688,
+      "file_name": ".cache\\MatMulNBits_2_0_289.const",
+      "file_size": 147456
+    },
+    "model.layers.11.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1036550144,
+      "file_name": ".cache\\MatMulNBits_2_0_290.const",
+      "file_size": 9437184
+    },
+    "model.layers.11.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1045987328,
+      "file_name": ".cache\\MatMulNBits_2_0_291.const",
+      "file_size": 12288
+    },
+    "model.layers.11.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1045999616,
+      "file_name": ".cache\\MatMulNBits_2_0_292.const",
+      "file_size": 294912
+    },
+    "model.layers.11.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1046294528,
+      "file_name": ".cache\\MatMulNBits_2_0_293.const",
+      "file_size": 73728
+    },
+    "model.layers.11.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1046368256,
+      "file_name": ".cache\\MatMulNBits_2_0_294.const",
+      "file_size": 9437184
+    },
+    "model.layers.11.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1055805440,
+      "file_name": ".cache\\MatMulNBits_2_0_295.const",
+      "file_size": 12288
+    },
+    "model.layers.11.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1055817728,
+      "file_name": ".cache\\MatMulNBits_2_0_296.const",
+      "file_size": 294912
+    },
+    "model.layers.11.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1056112640,
+      "file_name": ".cache\\MatMulNBits_2_0_297.const",
+      "file_size": 73728
+    },
+    "model.layers.11.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1056186368,
+      "file_name": ".cache\\MatMulNBits_2_0_298.const",
+      "file_size": 6144
+    },
+    "model.layers.11.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1056192512,
+      "file_name": ".cache\\MatMulNBits_2_0_299.const",
+      "file_size": 12582912
+    },
+    "model.layers.11.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1068775424,
+      "file_name": ".cache\\MatMulNBits_2_0_300.const",
+      "file_size": 786432
+    },
+    "model.layers.11.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1069561856,
+      "file_name": ".cache\\MatMulNBits_2_0_301.const",
+      "file_size": 98304
+    },
+    "model.layers.11.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1069660160,
+      "file_name": ".cache\\MatMulNBits_2_0_302.const",
+      "file_size": 32768
+    },
+    "model.layers.11.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1069692928,
+      "file_name": ".cache\\MatMulNBits_2_0_303.const",
+      "file_size": 12582912
+    },
+    "model.layers.11.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1082275840,
+      "file_name": ".cache\\MatMulNBits_2_0_304.const",
+      "file_size": 786432
+    },
+    "model.layers.11.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1083062272,
+      "file_name": ".cache\\MatMulNBits_2_0_305.const",
+      "file_size": 98304
+    },
+    "model.layers.11.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1083160576,
+      "file_name": ".cache\\MatMulNBits_2_0_306.const",
+      "file_size": 32768
+    },
+    "model.layers.11.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1083193344,
+      "file_name": ".cache\\MatMulNBits_2_0_307.const",
+      "file_size": 25165824
+    },
+    "model.layers.11.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1108359168,
+      "file_name": ".cache\\MatMulNBits_2_0_308.const",
+      "file_size": 12288
+    },
+    "model.layers.11.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1108371456,
+      "file_name": ".cache\\MatMulNBits_2_0_309.const",
+      "file_size": 786432
+    },
+    "model.layers.11.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 1109157888,
+      "file_name": ".cache\\MatMulNBits_2_0_310.const",
+      "file_size": 196608
+    },
+    "model.layers.12.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1109354496,
+      "file_name": ".cache\\MatMulNBits_2_0_311.const",
+      "file_size": 6144
+    },
+    "model.layers.12.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 1109360640,
+      "file_name": ".cache\\MatMulNBits_2_0_312.const",
+      "file_size": 18874368
+    },
+    "model.layers.12.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 1128235008,
+      "file_name": ".cache\\MatMulNBits_2_0_313.const",
+      "file_size": 24576
+    },
+    "model.layers.12.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 1128259584,
+      "file_name": ".cache\\MatMulNBits_2_0_314.const",
+      "file_size": 589824
+    },
+    "model.layers.12.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 1128849408,
+      "file_name": ".cache\\MatMulNBits_2_0_315.const",
+      "file_size": 147456
+    },
+    "model.layers.12.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1128996864,
+      "file_name": ".cache\\MatMulNBits_2_0_316.const",
+      "file_size": 9437184
+    },
+    "model.layers.12.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1138434048,
+      "file_name": ".cache\\MatMulNBits_2_0_317.const",
+      "file_size": 12288
+    },
+    "model.layers.12.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1138446336,
+      "file_name": ".cache\\MatMulNBits_2_0_318.const",
+      "file_size": 294912
+    },
+    "model.layers.12.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1138741248,
+      "file_name": ".cache\\MatMulNBits_2_0_319.const",
+      "file_size": 73728
+    },
+    "model.layers.12.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1138814976,
+      "file_name": ".cache\\MatMulNBits_2_0_320.const",
+      "file_size": 9437184
+    },
+    "model.layers.12.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1148252160,
+      "file_name": ".cache\\MatMulNBits_2_0_321.const",
+      "file_size": 12288
+    },
+    "model.layers.12.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1148264448,
+      "file_name": ".cache\\MatMulNBits_2_0_322.const",
+      "file_size": 294912
+    },
+    "model.layers.12.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1148559360,
+      "file_name": ".cache\\MatMulNBits_2_0_323.const",
+      "file_size": 73728
+    },
+    "model.layers.12.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1148633088,
+      "file_name": ".cache\\MatMulNBits_2_0_324.const",
+      "file_size": 6144
+    },
+    "model.layers.12.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1148639232,
+      "file_name": ".cache\\MatMulNBits_2_0_325.const",
+      "file_size": 12582912
+    },
+    "model.layers.12.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1161222144,
+      "file_name": ".cache\\MatMulNBits_2_0_326.const",
+      "file_size": 786432
+    },
+    "model.layers.12.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1162008576,
+      "file_name": ".cache\\MatMulNBits_2_0_327.const",
+      "file_size": 98304
+    },
+    "model.layers.12.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1162106880,
+      "file_name": ".cache\\MatMulNBits_2_0_328.const",
+      "file_size": 32768
+    },
+    "model.layers.12.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1162139648,
+      "file_name": ".cache\\MatMulNBits_2_0_329.const",
+      "file_size": 12582912
+    },
+    "model.layers.12.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1174722560,
+      "file_name": ".cache\\MatMulNBits_2_0_330.const",
+      "file_size": 786432
+    },
+    "model.layers.12.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1175508992,
+      "file_name": ".cache\\MatMulNBits_2_0_331.const",
+      "file_size": 98304
+    },
+    "model.layers.12.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1175607296,
+      "file_name": ".cache\\MatMulNBits_2_0_332.const",
+      "file_size": 32768
+    },
+    "model.layers.12.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1175640064,
+      "file_name": ".cache\\MatMulNBits_2_0_333.const",
+      "file_size": 25165824
+    },
+    "model.layers.12.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1200805888,
+      "file_name": ".cache\\MatMulNBits_2_0_334.const",
+      "file_size": 12288
+    },
+    "model.layers.12.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1200818176,
+      "file_name": ".cache\\MatMulNBits_2_0_335.const",
+      "file_size": 786432
+    },
+    "model.layers.12.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 1201604608,
+      "file_name": ".cache\\MatMulNBits_2_0_336.const",
+      "file_size": 196608
+    },
+    "model.layers.13.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1201801216,
+      "file_name": ".cache\\MatMulNBits_2_0_337.const",
+      "file_size": 6144
+    },
+    "model.layers.13.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 1201807360,
+      "file_name": ".cache\\MatMulNBits_2_0_338.const",
+      "file_size": 18874368
+    },
+    "model.layers.13.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 1220681728,
+      "file_name": ".cache\\MatMulNBits_2_0_339.const",
+      "file_size": 24576
+    },
+    "model.layers.13.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 1220706304,
+      "file_name": ".cache\\MatMulNBits_2_0_340.const",
+      "file_size": 589824
+    },
+    "model.layers.13.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 1221296128,
+      "file_name": ".cache\\MatMulNBits_2_0_341.const",
+      "file_size": 147456
+    },
+    "model.layers.13.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1221443584,
+      "file_name": ".cache\\MatMulNBits_2_0_342.const",
+      "file_size": 9437184
+    },
+    "model.layers.13.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1230880768,
+      "file_name": ".cache\\MatMulNBits_2_0_343.const",
+      "file_size": 12288
+    },
+    "model.layers.13.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1230893056,
+      "file_name": ".cache\\MatMulNBits_2_0_344.const",
+      "file_size": 294912
+    },
+    "model.layers.13.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1231187968,
+      "file_name": ".cache\\MatMulNBits_2_0_345.const",
+      "file_size": 73728
+    },
+    "model.layers.13.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1231261696,
+      "file_name": ".cache\\MatMulNBits_2_0_346.const",
+      "file_size": 9437184
+    },
+    "model.layers.13.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1240698880,
+      "file_name": ".cache\\MatMulNBits_2_0_347.const",
+      "file_size": 12288
+    },
+    "model.layers.13.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1240711168,
+      "file_name": ".cache\\MatMulNBits_2_0_348.const",
+      "file_size": 294912
+    },
+    "model.layers.13.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1241006080,
+      "file_name": ".cache\\MatMulNBits_2_0_349.const",
+      "file_size": 73728
+    },
+    "model.layers.13.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1241079808,
+      "file_name": ".cache\\MatMulNBits_2_0_350.const",
+      "file_size": 6144
+    },
+    "model.layers.13.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1241085952,
+      "file_name": ".cache\\MatMulNBits_2_0_351.const",
+      "file_size": 12582912
+    },
+    "model.layers.13.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1253668864,
+      "file_name": ".cache\\MatMulNBits_2_0_352.const",
+      "file_size": 786432
+    },
+    "model.layers.13.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1254455296,
+      "file_name": ".cache\\MatMulNBits_2_0_353.const",
+      "file_size": 98304
+    },
+    "model.layers.13.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1254553600,
+      "file_name": ".cache\\MatMulNBits_2_0_354.const",
+      "file_size": 32768
+    },
+    "model.layers.13.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1254586368,
+      "file_name": ".cache\\MatMulNBits_2_0_355.const",
+      "file_size": 12582912
+    },
+    "model.layers.13.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1267169280,
+      "file_name": ".cache\\MatMulNBits_2_0_356.const",
+      "file_size": 786432
+    },
+    "model.layers.13.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1267955712,
+      "file_name": ".cache\\MatMulNBits_2_0_357.const",
+      "file_size": 98304
+    },
+    "model.layers.13.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1268054016,
+      "file_name": ".cache\\MatMulNBits_2_0_358.const",
+      "file_size": 32768
+    },
+    "model.layers.13.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1268086784,
+      "file_name": ".cache\\MatMulNBits_2_0_359.const",
+      "file_size": 25165824
+    },
+    "model.layers.13.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1293252608,
+      "file_name": ".cache\\MatMulNBits_2_0_360.const",
+      "file_size": 12288
+    },
+    "model.layers.13.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1293264896,
+      "file_name": ".cache\\MatMulNBits_2_0_361.const",
+      "file_size": 786432
+    },
+    "model.layers.13.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 1294051328,
+      "file_name": ".cache\\MatMulNBits_2_0_362.const",
+      "file_size": 196608
+    },
+    "model.layers.14.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1294247936,
+      "file_name": ".cache\\MatMulNBits_2_0_363.const",
+      "file_size": 6144
+    },
+    "model.layers.14.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 1294254080,
+      "file_name": ".cache\\MatMulNBits_2_0_364.const",
+      "file_size": 18874368
+    },
+    "model.layers.14.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 1313128448,
+      "file_name": ".cache\\MatMulNBits_2_0_365.const",
+      "file_size": 24576
+    },
+    "model.layers.14.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 1313153024,
+      "file_name": ".cache\\MatMulNBits_2_0_366.const",
+      "file_size": 589824
+    },
+    "model.layers.14.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 1313742848,
+      "file_name": ".cache\\MatMulNBits_2_0_367.const",
+      "file_size": 147456
+    },
+    "model.layers.14.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1313890304,
+      "file_name": ".cache\\MatMulNBits_2_0_368.const",
+      "file_size": 9437184
+    },
+    "model.layers.14.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1323327488,
+      "file_name": ".cache\\MatMulNBits_2_0_369.const",
+      "file_size": 12288
+    },
+    "model.layers.14.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1323339776,
+      "file_name": ".cache\\MatMulNBits_2_0_370.const",
+      "file_size": 294912
+    },
+    "model.layers.14.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1323634688,
+      "file_name": ".cache\\MatMulNBits_2_0_371.const",
+      "file_size": 73728
+    },
+    "model.layers.14.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1323708416,
+      "file_name": ".cache\\MatMulNBits_2_0_372.const",
+      "file_size": 9437184
+    },
+    "model.layers.14.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1333145600,
+      "file_name": ".cache\\MatMulNBits_2_0_373.const",
+      "file_size": 12288
+    },
+    "model.layers.14.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1333157888,
+      "file_name": ".cache\\MatMulNBits_2_0_374.const",
+      "file_size": 294912
+    },
+    "model.layers.14.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1333452800,
+      "file_name": ".cache\\MatMulNBits_2_0_375.const",
+      "file_size": 73728
+    },
+    "model.layers.14.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1333526528,
+      "file_name": ".cache\\MatMulNBits_2_0_376.const",
+      "file_size": 6144
+    },
+    "model.layers.14.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1333532672,
+      "file_name": ".cache\\MatMulNBits_2_0_377.const",
+      "file_size": 12582912
+    },
+    "model.layers.14.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1346115584,
+      "file_name": ".cache\\MatMulNBits_2_0_378.const",
+      "file_size": 786432
+    },
+    "model.layers.14.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1346902016,
+      "file_name": ".cache\\MatMulNBits_2_0_379.const",
+      "file_size": 98304
+    },
+    "model.layers.14.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1347000320,
+      "file_name": ".cache\\MatMulNBits_2_0_380.const",
+      "file_size": 32768
+    },
+    "model.layers.14.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1347033088,
+      "file_name": ".cache\\MatMulNBits_2_0_381.const",
+      "file_size": 12582912
+    },
+    "model.layers.14.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1359616000,
+      "file_name": ".cache\\MatMulNBits_2_0_382.const",
+      "file_size": 786432
+    },
+    "model.layers.14.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1360402432,
+      "file_name": ".cache\\MatMulNBits_2_0_383.const",
+      "file_size": 98304
+    },
+    "model.layers.14.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1360500736,
+      "file_name": ".cache\\MatMulNBits_2_0_384.const",
+      "file_size": 32768
+    },
+    "model.layers.14.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1360533504,
+      "file_name": ".cache\\MatMulNBits_2_0_385.const",
+      "file_size": 25165824
+    },
+    "model.layers.14.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1385699328,
+      "file_name": ".cache\\MatMulNBits_2_0_386.const",
+      "file_size": 12288
+    },
+    "model.layers.14.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1385711616,
+      "file_name": ".cache\\MatMulNBits_2_0_387.const",
+      "file_size": 786432
+    },
+    "model.layers.14.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 1386498048,
+      "file_name": ".cache\\MatMulNBits_2_0_388.const",
+      "file_size": 196608
+    },
+    "model.layers.15.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1386694656,
+      "file_name": ".cache\\MatMulNBits_2_0_389.const",
+      "file_size": 6144
+    },
+    "model.layers.15.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 1386700800,
+      "file_name": ".cache\\MatMulNBits_2_0_390.const",
+      "file_size": 18874368
+    },
+    "model.layers.15.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 1405575168,
+      "file_name": ".cache\\MatMulNBits_2_0_391.const",
+      "file_size": 24576
+    },
+    "model.layers.15.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 1405599744,
+      "file_name": ".cache\\MatMulNBits_2_0_392.const",
+      "file_size": 589824
+    },
+    "model.layers.15.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 1406189568,
+      "file_name": ".cache\\MatMulNBits_2_0_393.const",
+      "file_size": 147456
+    },
+    "model.layers.15.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1406337024,
+      "file_name": ".cache\\MatMulNBits_2_0_394.const",
+      "file_size": 9437184
+    },
+    "model.layers.15.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1415774208,
+      "file_name": ".cache\\MatMulNBits_2_0_395.const",
+      "file_size": 12288
+    },
+    "model.layers.15.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1415786496,
+      "file_name": ".cache\\MatMulNBits_2_0_396.const",
+      "file_size": 294912
+    },
+    "model.layers.15.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1416081408,
+      "file_name": ".cache\\MatMulNBits_2_0_397.const",
+      "file_size": 73728
+    },
+    "model.layers.15.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1416155136,
+      "file_name": ".cache\\MatMulNBits_2_0_398.const",
+      "file_size": 9437184
+    },
+    "model.layers.15.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1425592320,
+      "file_name": ".cache\\MatMulNBits_2_0_399.const",
+      "file_size": 12288
+    },
+    "model.layers.15.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1425604608,
+      "file_name": ".cache\\MatMulNBits_2_0_400.const",
+      "file_size": 294912
+    },
+    "model.layers.15.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1425899520,
+      "file_name": ".cache\\MatMulNBits_2_0_401.const",
+      "file_size": 73728
+    },
+    "model.layers.15.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1425973248,
+      "file_name": ".cache\\MatMulNBits_2_0_402.const",
+      "file_size": 6144
+    },
+    "model.layers.15.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1425979392,
+      "file_name": ".cache\\MatMulNBits_2_0_403.const",
+      "file_size": 12582912
+    },
+    "model.layers.15.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1438562304,
+      "file_name": ".cache\\MatMulNBits_2_0_404.const",
+      "file_size": 786432
+    },
+    "model.layers.15.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1439348736,
+      "file_name": ".cache\\MatMulNBits_2_0_405.const",
+      "file_size": 98304
+    },
+    "model.layers.15.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1439447040,
+      "file_name": ".cache\\MatMulNBits_2_0_406.const",
+      "file_size": 32768
+    },
+    "model.layers.15.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1439479808,
+      "file_name": ".cache\\MatMulNBits_2_0_407.const",
+      "file_size": 12582912
+    },
+    "model.layers.15.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1452062720,
+      "file_name": ".cache\\MatMulNBits_2_0_408.const",
+      "file_size": 786432
+    },
+    "model.layers.15.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1452849152,
+      "file_name": ".cache\\MatMulNBits_2_0_409.const",
+      "file_size": 98304
+    },
+    "model.layers.15.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1452947456,
+      "file_name": ".cache\\MatMulNBits_2_0_410.const",
+      "file_size": 32768
+    },
+    "model.layers.15.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1452980224,
+      "file_name": ".cache\\MatMulNBits_2_0_411.const",
+      "file_size": 25165824
+    },
+    "model.layers.15.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1478146048,
+      "file_name": ".cache\\MatMulNBits_2_0_412.const",
+      "file_size": 12288
+    },
+    "model.layers.15.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1478158336,
+      "file_name": ".cache\\MatMulNBits_2_0_413.const",
+      "file_size": 786432
+    },
+    "model.layers.15.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 1478944768,
+      "file_name": ".cache\\MatMulNBits_2_0_414.const",
+      "file_size": 196608
+    },
+    "model.layers.16.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1479141376,
+      "file_name": ".cache\\MatMulNBits_2_0_415.const",
+      "file_size": 6144
+    },
+    "model.layers.16.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 1479147520,
+      "file_name": ".cache\\MatMulNBits_2_0_416.const",
+      "file_size": 18874368
+    },
+    "model.layers.16.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 1498021888,
+      "file_name": ".cache\\MatMulNBits_2_0_417.const",
+      "file_size": 24576
+    },
+    "model.layers.16.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 1498046464,
+      "file_name": ".cache\\MatMulNBits_2_0_418.const",
+      "file_size": 589824
+    },
+    "model.layers.16.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 1498636288,
+      "file_name": ".cache\\MatMulNBits_2_0_419.const",
+      "file_size": 147456
+    },
+    "model.layers.16.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1498783744,
+      "file_name": ".cache\\MatMulNBits_2_0_420.const",
+      "file_size": 9437184
+    },
+    "model.layers.16.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1508220928,
+      "file_name": ".cache\\MatMulNBits_2_0_421.const",
+      "file_size": 12288
+    },
+    "model.layers.16.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1508233216,
+      "file_name": ".cache\\MatMulNBits_2_0_422.const",
+      "file_size": 294912
+    },
+    "model.layers.16.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1508528128,
+      "file_name": ".cache\\MatMulNBits_2_0_423.const",
+      "file_size": 73728
+    },
+    "model.layers.16.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1508601856,
+      "file_name": ".cache\\MatMulNBits_2_0_424.const",
+      "file_size": 9437184
+    },
+    "model.layers.16.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1518039040,
+      "file_name": ".cache\\MatMulNBits_2_0_425.const",
+      "file_size": 12288
+    },
+    "model.layers.16.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1518051328,
+      "file_name": ".cache\\MatMulNBits_2_0_426.const",
+      "file_size": 294912
+    },
+    "model.layers.16.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1518346240,
+      "file_name": ".cache\\MatMulNBits_2_0_427.const",
+      "file_size": 73728
+    },
+    "model.layers.16.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1518419968,
+      "file_name": ".cache\\MatMulNBits_2_0_428.const",
+      "file_size": 6144
+    },
+    "model.layers.16.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1518426112,
+      "file_name": ".cache\\MatMulNBits_2_0_429.const",
+      "file_size": 12582912
+    },
+    "model.layers.16.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1531009024,
+      "file_name": ".cache\\MatMulNBits_2_0_430.const",
+      "file_size": 786432
+    },
+    "model.layers.16.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1531795456,
+      "file_name": ".cache\\MatMulNBits_2_0_431.const",
+      "file_size": 98304
+    },
+    "model.layers.16.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1531893760,
+      "file_name": ".cache\\MatMulNBits_2_0_432.const",
+      "file_size": 32768
+    },
+    "model.layers.16.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1531926528,
+      "file_name": ".cache\\MatMulNBits_2_0_433.const",
+      "file_size": 12582912
+    },
+    "model.layers.16.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1544509440,
+      "file_name": ".cache\\MatMulNBits_2_0_434.const",
+      "file_size": 786432
+    },
+    "model.layers.16.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1545295872,
+      "file_name": ".cache\\MatMulNBits_2_0_435.const",
+      "file_size": 98304
+    },
+    "model.layers.16.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1545394176,
+      "file_name": ".cache\\MatMulNBits_2_0_436.const",
+      "file_size": 32768
+    },
+    "model.layers.16.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1545426944,
+      "file_name": ".cache\\MatMulNBits_2_0_437.const",
+      "file_size": 25165824
+    },
+    "model.layers.16.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1570592768,
+      "file_name": ".cache\\MatMulNBits_2_0_438.const",
+      "file_size": 12288
+    },
+    "model.layers.16.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1570605056,
+      "file_name": ".cache\\MatMulNBits_2_0_439.const",
+      "file_size": 786432
+    },
+    "model.layers.16.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 1571391488,
+      "file_name": ".cache\\MatMulNBits_2_0_440.const",
+      "file_size": 196608
+    },
+    "model.layers.17.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1571588096,
+      "file_name": ".cache\\MatMulNBits_2_0_441.const",
+      "file_size": 6144
+    },
+    "model.layers.17.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 1571594240,
+      "file_name": ".cache\\MatMulNBits_2_0_442.const",
+      "file_size": 18874368
+    },
+    "model.layers.17.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 1590468608,
+      "file_name": ".cache\\MatMulNBits_2_0_443.const",
+      "file_size": 24576
+    },
+    "model.layers.17.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 1590493184,
+      "file_name": ".cache\\MatMulNBits_2_0_444.const",
+      "file_size": 589824
+    },
+    "model.layers.17.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 1591083008,
+      "file_name": ".cache\\MatMulNBits_2_0_445.const",
+      "file_size": 147456
+    },
+    "model.layers.17.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1591230464,
+      "file_name": ".cache\\MatMulNBits_2_0_446.const",
+      "file_size": 9437184
+    },
+    "model.layers.17.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1600667648,
+      "file_name": ".cache\\MatMulNBits_2_0_447.const",
+      "file_size": 12288
+    },
+    "model.layers.17.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1600679936,
+      "file_name": ".cache\\MatMulNBits_2_0_448.const",
+      "file_size": 294912
+    },
+    "model.layers.17.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1600974848,
+      "file_name": ".cache\\MatMulNBits_2_0_449.const",
+      "file_size": 73728
+    },
+    "model.layers.17.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1601048576,
+      "file_name": ".cache\\MatMulNBits_2_0_450.const",
+      "file_size": 9437184
+    },
+    "model.layers.17.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1610485760,
+      "file_name": ".cache\\MatMulNBits_2_0_451.const",
+      "file_size": 12288
+    },
+    "model.layers.17.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1610498048,
+      "file_name": ".cache\\MatMulNBits_2_0_452.const",
+      "file_size": 294912
+    },
+    "model.layers.17.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1610792960,
+      "file_name": ".cache\\MatMulNBits_2_0_453.const",
+      "file_size": 73728
+    },
+    "model.layers.17.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1610866688,
+      "file_name": ".cache\\MatMulNBits_2_0_454.const",
+      "file_size": 6144
+    },
+    "model.layers.17.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1610872832,
+      "file_name": ".cache\\MatMulNBits_2_0_455.const",
+      "file_size": 12582912
+    },
+    "model.layers.17.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1623455744,
+      "file_name": ".cache\\MatMulNBits_2_0_456.const",
+      "file_size": 786432
+    },
+    "model.layers.17.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1624242176,
+      "file_name": ".cache\\MatMulNBits_2_0_457.const",
+      "file_size": 98304
+    },
+    "model.layers.17.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1624340480,
+      "file_name": ".cache\\MatMulNBits_2_0_458.const",
+      "file_size": 32768
+    },
+    "model.layers.17.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1624373248,
+      "file_name": ".cache\\MatMulNBits_2_0_459.const",
+      "file_size": 12582912
+    },
+    "model.layers.17.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1636956160,
+      "file_name": ".cache\\MatMulNBits_2_0_460.const",
+      "file_size": 786432
+    },
+    "model.layers.17.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1637742592,
+      "file_name": ".cache\\MatMulNBits_2_0_461.const",
+      "file_size": 98304
+    },
+    "model.layers.17.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1637840896,
+      "file_name": ".cache\\MatMulNBits_2_0_462.const",
+      "file_size": 32768
+    },
+    "model.layers.17.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1637873664,
+      "file_name": ".cache\\MatMulNBits_2_0_463.const",
+      "file_size": 25165824
+    },
+    "model.layers.17.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1663039488,
+      "file_name": ".cache\\MatMulNBits_2_0_464.const",
+      "file_size": 12288
+    },
+    "model.layers.17.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1663051776,
+      "file_name": ".cache\\MatMulNBits_2_0_465.const",
+      "file_size": 786432
+    },
+    "model.layers.17.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 1663838208,
+      "file_name": ".cache\\MatMulNBits_2_0_466.const",
+      "file_size": 196608
+    },
+    "model.layers.18.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1664034816,
+      "file_name": ".cache\\MatMulNBits_2_0_467.const",
+      "file_size": 6144
+    },
+    "model.layers.18.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 1664040960,
+      "file_name": ".cache\\MatMulNBits_2_0_468.const",
+      "file_size": 18874368
+    },
+    "model.layers.18.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 1682915328,
+      "file_name": ".cache\\MatMulNBits_2_0_469.const",
+      "file_size": 24576
+    },
+    "model.layers.18.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 1682939904,
+      "file_name": ".cache\\MatMulNBits_2_0_470.const",
+      "file_size": 589824
+    },
+    "model.layers.18.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 1683529728,
+      "file_name": ".cache\\MatMulNBits_2_0_471.const",
+      "file_size": 147456
+    },
+    "model.layers.18.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1683677184,
+      "file_name": ".cache\\MatMulNBits_2_0_472.const",
+      "file_size": 9437184
+    },
+    "model.layers.18.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1693114368,
+      "file_name": ".cache\\MatMulNBits_2_0_473.const",
+      "file_size": 12288
+    },
+    "model.layers.18.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1693126656,
+      "file_name": ".cache\\MatMulNBits_2_0_474.const",
+      "file_size": 294912
+    },
+    "model.layers.18.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1693421568,
+      "file_name": ".cache\\MatMulNBits_2_0_475.const",
+      "file_size": 73728
+    },
+    "model.layers.18.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1693495296,
+      "file_name": ".cache\\MatMulNBits_2_0_476.const",
+      "file_size": 9437184
+    },
+    "model.layers.18.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1702932480,
+      "file_name": ".cache\\MatMulNBits_2_0_477.const",
+      "file_size": 12288
+    },
+    "model.layers.18.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1702944768,
+      "file_name": ".cache\\MatMulNBits_2_0_478.const",
+      "file_size": 294912
+    },
+    "model.layers.18.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1703239680,
+      "file_name": ".cache\\MatMulNBits_2_0_479.const",
+      "file_size": 73728
+    },
+    "model.layers.18.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1703313408,
+      "file_name": ".cache\\MatMulNBits_2_0_480.const",
+      "file_size": 6144
+    },
+    "model.layers.18.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1703319552,
+      "file_name": ".cache\\MatMulNBits_2_0_481.const",
+      "file_size": 12582912
+    },
+    "model.layers.18.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1715902464,
+      "file_name": ".cache\\MatMulNBits_2_0_482.const",
+      "file_size": 786432
+    },
+    "model.layers.18.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1716688896,
+      "file_name": ".cache\\MatMulNBits_2_0_483.const",
+      "file_size": 98304
+    },
+    "model.layers.18.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1716787200,
+      "file_name": ".cache\\MatMulNBits_2_0_484.const",
+      "file_size": 32768
+    },
+    "model.layers.18.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1716819968,
+      "file_name": ".cache\\MatMulNBits_2_0_485.const",
+      "file_size": 12582912
+    },
+    "model.layers.18.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1729402880,
+      "file_name": ".cache\\MatMulNBits_2_0_486.const",
+      "file_size": 786432
+    },
+    "model.layers.18.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1730189312,
+      "file_name": ".cache\\MatMulNBits_2_0_487.const",
+      "file_size": 98304
+    },
+    "model.layers.18.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1730287616,
+      "file_name": ".cache\\MatMulNBits_2_0_488.const",
+      "file_size": 32768
+    },
+    "model.layers.18.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1730320384,
+      "file_name": ".cache\\MatMulNBits_2_0_489.const",
+      "file_size": 25165824
+    },
+    "model.layers.18.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1755486208,
+      "file_name": ".cache\\MatMulNBits_2_0_490.const",
+      "file_size": 12288
+    },
+    "model.layers.18.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1755498496,
+      "file_name": ".cache\\MatMulNBits_2_0_491.const",
+      "file_size": 786432
+    },
+    "model.layers.18.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 1756284928,
+      "file_name": ".cache\\MatMulNBits_2_0_492.const",
+      "file_size": 196608
+    },
+    "model.layers.19.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1756481536,
+      "file_name": ".cache\\MatMulNBits_2_0_493.const",
+      "file_size": 6144
+    },
+    "model.layers.19.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 1756487680,
+      "file_name": ".cache\\MatMulNBits_2_0_494.const",
+      "file_size": 18874368
+    },
+    "model.layers.19.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 1775362048,
+      "file_name": ".cache\\MatMulNBits_2_0_495.const",
+      "file_size": 24576
+    },
+    "model.layers.19.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 1775386624,
+      "file_name": ".cache\\MatMulNBits_2_0_496.const",
+      "file_size": 589824
+    },
+    "model.layers.19.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 1775976448,
+      "file_name": ".cache\\MatMulNBits_2_0_497.const",
+      "file_size": 147456
+    },
+    "model.layers.19.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1776123904,
+      "file_name": ".cache\\MatMulNBits_2_0_498.const",
+      "file_size": 9437184
+    },
+    "model.layers.19.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1785561088,
+      "file_name": ".cache\\MatMulNBits_2_0_499.const",
+      "file_size": 12288
+    },
+    "model.layers.19.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1785573376,
+      "file_name": ".cache\\MatMulNBits_2_0_500.const",
+      "file_size": 294912
+    },
+    "model.layers.19.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1785868288,
+      "file_name": ".cache\\MatMulNBits_2_0_501.const",
+      "file_size": 73728
+    },
+    "model.layers.19.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1785942016,
+      "file_name": ".cache\\MatMulNBits_2_0_502.const",
+      "file_size": 9437184
+    },
+    "model.layers.19.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1795379200,
+      "file_name": ".cache\\MatMulNBits_2_0_503.const",
+      "file_size": 12288
+    },
+    "model.layers.19.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1795391488,
+      "file_name": ".cache\\MatMulNBits_2_0_504.const",
+      "file_size": 294912
+    },
+    "model.layers.19.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1795686400,
+      "file_name": ".cache\\MatMulNBits_2_0_505.const",
+      "file_size": 73728
+    },
+    "model.layers.19.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1795760128,
+      "file_name": ".cache\\MatMulNBits_2_0_506.const",
+      "file_size": 6144
+    },
+    "model.layers.19.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1795766272,
+      "file_name": ".cache\\MatMulNBits_2_0_507.const",
+      "file_size": 12582912
+    },
+    "model.layers.19.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1808349184,
+      "file_name": ".cache\\MatMulNBits_2_0_508.const",
+      "file_size": 786432
+    },
+    "model.layers.19.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1809135616,
+      "file_name": ".cache\\MatMulNBits_2_0_509.const",
+      "file_size": 98304
+    },
+    "model.layers.19.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1809233920,
+      "file_name": ".cache\\MatMulNBits_2_0_510.const",
+      "file_size": 32768
+    },
+    "model.layers.19.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1809266688,
+      "file_name": ".cache\\MatMulNBits_2_0_511.const",
+      "file_size": 12582912
+    },
+    "model.layers.19.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1821849600,
+      "file_name": ".cache\\MatMulNBits_2_0_512.const",
+      "file_size": 786432
+    },
+    "model.layers.19.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1822636032,
+      "file_name": ".cache\\MatMulNBits_2_0_513.const",
+      "file_size": 98304
+    },
+    "model.layers.19.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1822734336,
+      "file_name": ".cache\\MatMulNBits_2_0_514.const",
+      "file_size": 32768
+    },
+    "model.layers.19.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1822767104,
+      "file_name": ".cache\\MatMulNBits_2_0_515.const",
+      "file_size": 25165824
+    },
+    "model.layers.19.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1847932928,
+      "file_name": ".cache\\MatMulNBits_2_0_516.const",
+      "file_size": 12288
+    },
+    "model.layers.19.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1847945216,
+      "file_name": ".cache\\MatMulNBits_2_0_517.const",
+      "file_size": 786432
+    },
+    "model.layers.19.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 1848731648,
+      "file_name": ".cache\\MatMulNBits_2_0_518.const",
+      "file_size": 196608
+    },
+    "model.layers.20.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1848928256,
+      "file_name": ".cache\\MatMulNBits_2_0_519.const",
+      "file_size": 6144
+    },
+    "model.layers.20.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 1848934400,
+      "file_name": ".cache\\MatMulNBits_2_0_520.const",
+      "file_size": 18874368
+    },
+    "model.layers.20.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 1867808768,
+      "file_name": ".cache\\MatMulNBits_2_0_521.const",
+      "file_size": 24576
+    },
+    "model.layers.20.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 1867833344,
+      "file_name": ".cache\\MatMulNBits_2_0_522.const",
+      "file_size": 589824
+    },
+    "model.layers.20.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 1868423168,
+      "file_name": ".cache\\MatMulNBits_2_0_523.const",
+      "file_size": 147456
+    },
+    "model.layers.20.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1868570624,
+      "file_name": ".cache\\MatMulNBits_2_0_524.const",
+      "file_size": 9437184
+    },
+    "model.layers.20.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1878007808,
+      "file_name": ".cache\\MatMulNBits_2_0_525.const",
+      "file_size": 12288
+    },
+    "model.layers.20.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1878020096,
+      "file_name": ".cache\\MatMulNBits_2_0_526.const",
+      "file_size": 294912
+    },
+    "model.layers.20.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1878315008,
+      "file_name": ".cache\\MatMulNBits_2_0_527.const",
+      "file_size": 73728
+    },
+    "model.layers.20.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1878388736,
+      "file_name": ".cache\\MatMulNBits_2_0_528.const",
+      "file_size": 9437184
+    },
+    "model.layers.20.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1887825920,
+      "file_name": ".cache\\MatMulNBits_2_0_529.const",
+      "file_size": 12288
+    },
+    "model.layers.20.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1887838208,
+      "file_name": ".cache\\MatMulNBits_2_0_530.const",
+      "file_size": 294912
+    },
+    "model.layers.20.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1888133120,
+      "file_name": ".cache\\MatMulNBits_2_0_531.const",
+      "file_size": 73728
+    },
+    "model.layers.20.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1888206848,
+      "file_name": ".cache\\MatMulNBits_2_0_532.const",
+      "file_size": 6144
+    },
+    "model.layers.20.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1888212992,
+      "file_name": ".cache\\MatMulNBits_2_0_533.const",
+      "file_size": 12582912
+    },
+    "model.layers.20.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1900795904,
+      "file_name": ".cache\\MatMulNBits_2_0_534.const",
+      "file_size": 786432
+    },
+    "model.layers.20.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1901582336,
+      "file_name": ".cache\\MatMulNBits_2_0_535.const",
+      "file_size": 98304
+    },
+    "model.layers.20.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1901680640,
+      "file_name": ".cache\\MatMulNBits_2_0_536.const",
+      "file_size": 32768
+    },
+    "model.layers.20.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1901713408,
+      "file_name": ".cache\\MatMulNBits_2_0_537.const",
+      "file_size": 12582912
+    },
+    "model.layers.20.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1914296320,
+      "file_name": ".cache\\MatMulNBits_2_0_538.const",
+      "file_size": 786432
+    },
+    "model.layers.20.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1915082752,
+      "file_name": ".cache\\MatMulNBits_2_0_539.const",
+      "file_size": 98304
+    },
+    "model.layers.20.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1915181056,
+      "file_name": ".cache\\MatMulNBits_2_0_540.const",
+      "file_size": 32768
+    },
+    "model.layers.20.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1915213824,
+      "file_name": ".cache\\MatMulNBits_2_0_541.const",
+      "file_size": 25165824
+    },
+    "model.layers.20.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1940379648,
+      "file_name": ".cache\\MatMulNBits_2_0_542.const",
+      "file_size": 12288
+    },
+    "model.layers.20.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1940391936,
+      "file_name": ".cache\\MatMulNBits_2_0_543.const",
+      "file_size": 786432
+    },
+    "model.layers.20.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 1941178368,
+      "file_name": ".cache\\MatMulNBits_2_0_544.const",
+      "file_size": 196608
+    },
+    "model.layers.21.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1941374976,
+      "file_name": ".cache\\MatMulNBits_2_0_545.const",
+      "file_size": 6144
+    },
+    "model.layers.21.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 1941381120,
+      "file_name": ".cache\\MatMulNBits_2_0_546.const",
+      "file_size": 18874368
+    },
+    "model.layers.21.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 1960255488,
+      "file_name": ".cache\\MatMulNBits_2_0_547.const",
+      "file_size": 24576
+    },
+    "model.layers.21.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 1960280064,
+      "file_name": ".cache\\MatMulNBits_2_0_548.const",
+      "file_size": 589824
+    },
+    "model.layers.21.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 1960869888,
+      "file_name": ".cache\\MatMulNBits_2_0_549.const",
+      "file_size": 147456
+    },
+    "model.layers.21.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1961017344,
+      "file_name": ".cache\\MatMulNBits_2_0_550.const",
+      "file_size": 9437184
+    },
+    "model.layers.21.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1970454528,
+      "file_name": ".cache\\MatMulNBits_2_0_551.const",
+      "file_size": 12288
+    },
+    "model.layers.21.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1970466816,
+      "file_name": ".cache\\MatMulNBits_2_0_552.const",
+      "file_size": 294912
+    },
+    "model.layers.21.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1970761728,
+      "file_name": ".cache\\MatMulNBits_2_0_553.const",
+      "file_size": 73728
+    },
+    "model.layers.21.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 1970835456,
+      "file_name": ".cache\\MatMulNBits_2_0_554.const",
+      "file_size": 9437184
+    },
+    "model.layers.21.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 1980272640,
+      "file_name": ".cache\\MatMulNBits_2_0_555.const",
+      "file_size": 12288
+    },
+    "model.layers.21.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 1980284928,
+      "file_name": ".cache\\MatMulNBits_2_0_556.const",
+      "file_size": 294912
+    },
+    "model.layers.21.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 1980579840,
+      "file_name": ".cache\\MatMulNBits_2_0_557.const",
+      "file_size": 73728
+    },
+    "model.layers.21.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 1980653568,
+      "file_name": ".cache\\MatMulNBits_2_0_558.const",
+      "file_size": 6144
+    },
+    "model.layers.21.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1980659712,
+      "file_name": ".cache\\MatMulNBits_2_0_559.const",
+      "file_size": 12582912
+    },
+    "model.layers.21.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 1993242624,
+      "file_name": ".cache\\MatMulNBits_2_0_560.const",
+      "file_size": 786432
+    },
+    "model.layers.21.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 1994029056,
+      "file_name": ".cache\\MatMulNBits_2_0_561.const",
+      "file_size": 98304
+    },
+    "model.layers.21.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1994127360,
+      "file_name": ".cache\\MatMulNBits_2_0_562.const",
+      "file_size": 32768
+    },
+    "model.layers.21.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 1994160128,
+      "file_name": ".cache\\MatMulNBits_2_0_563.const",
+      "file_size": 12582912
+    },
+    "model.layers.21.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2006743040,
+      "file_name": ".cache\\MatMulNBits_2_0_564.const",
+      "file_size": 786432
+    },
+    "model.layers.21.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2007529472,
+      "file_name": ".cache\\MatMulNBits_2_0_565.const",
+      "file_size": 98304
+    },
+    "model.layers.21.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2007627776,
+      "file_name": ".cache\\MatMulNBits_2_0_566.const",
+      "file_size": 32768
+    },
+    "model.layers.21.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 2007660544,
+      "file_name": ".cache\\MatMulNBits_2_0_567.const",
+      "file_size": 25165824
+    },
+    "model.layers.21.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2032826368,
+      "file_name": ".cache\\MatMulNBits_2_0_568.const",
+      "file_size": 12288
+    },
+    "model.layers.21.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2032838656,
+      "file_name": ".cache\\MatMulNBits_2_0_569.const",
+      "file_size": 786432
+    },
+    "model.layers.21.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 2033625088,
+      "file_name": ".cache\\MatMulNBits_2_0_570.const",
+      "file_size": 196608
+    },
+    "model.layers.22.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2033821696,
+      "file_name": ".cache\\MatMulNBits_2_0_571.const",
+      "file_size": 6144
+    },
+    "model.layers.22.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 2033827840,
+      "file_name": ".cache\\MatMulNBits_2_0_572.const",
+      "file_size": 18874368
+    },
+    "model.layers.22.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 2052702208,
+      "file_name": ".cache\\MatMulNBits_2_0_573.const",
+      "file_size": 24576
+    },
+    "model.layers.22.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 2052726784,
+      "file_name": ".cache\\MatMulNBits_2_0_574.const",
+      "file_size": 589824
+    },
+    "model.layers.22.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 2053316608,
+      "file_name": ".cache\\MatMulNBits_2_0_575.const",
+      "file_size": 147456
+    },
+    "model.layers.22.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2053464064,
+      "file_name": ".cache\\MatMulNBits_2_0_576.const",
+      "file_size": 9437184
+    },
+    "model.layers.22.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2062901248,
+      "file_name": ".cache\\MatMulNBits_2_0_577.const",
+      "file_size": 12288
+    },
+    "model.layers.22.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2062913536,
+      "file_name": ".cache\\MatMulNBits_2_0_578.const",
+      "file_size": 294912
+    },
+    "model.layers.22.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2063208448,
+      "file_name": ".cache\\MatMulNBits_2_0_579.const",
+      "file_size": 73728
+    },
+    "model.layers.22.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2063282176,
+      "file_name": ".cache\\MatMulNBits_2_0_580.const",
+      "file_size": 9437184
+    },
+    "model.layers.22.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2072719360,
+      "file_name": ".cache\\MatMulNBits_2_0_581.const",
+      "file_size": 12288
+    },
+    "model.layers.22.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2072731648,
+      "file_name": ".cache\\MatMulNBits_2_0_582.const",
+      "file_size": 294912
+    },
+    "model.layers.22.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2073026560,
+      "file_name": ".cache\\MatMulNBits_2_0_583.const",
+      "file_size": 73728
+    },
+    "model.layers.22.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2073100288,
+      "file_name": ".cache\\MatMulNBits_2_0_584.const",
+      "file_size": 6144
+    },
+    "model.layers.22.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2073106432,
+      "file_name": ".cache\\MatMulNBits_2_0_585.const",
+      "file_size": 12582912
+    },
+    "model.layers.22.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2085689344,
+      "file_name": ".cache\\MatMulNBits_2_0_586.const",
+      "file_size": 786432
+    },
+    "model.layers.22.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2086475776,
+      "file_name": ".cache\\MatMulNBits_2_0_587.const",
+      "file_size": 98304
+    },
+    "model.layers.22.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2086574080,
+      "file_name": ".cache\\MatMulNBits_2_0_588.const",
+      "file_size": 32768
+    },
+    "model.layers.22.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2086606848,
+      "file_name": ".cache\\MatMulNBits_2_0_589.const",
+      "file_size": 12582912
+    },
+    "model.layers.22.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2099189760,
+      "file_name": ".cache\\MatMulNBits_2_0_590.const",
+      "file_size": 786432
+    },
+    "model.layers.22.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2099976192,
+      "file_name": ".cache\\MatMulNBits_2_0_591.const",
+      "file_size": 98304
+    },
+    "model.layers.22.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2100074496,
+      "file_name": ".cache\\MatMulNBits_2_0_592.const",
+      "file_size": 32768
+    },
+    "model.layers.22.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 2100107264,
+      "file_name": ".cache\\MatMulNBits_2_0_593.const",
+      "file_size": 25165824
+    },
+    "model.layers.22.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2125273088,
+      "file_name": ".cache\\MatMulNBits_2_0_594.const",
+      "file_size": 12288
+    },
+    "model.layers.22.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2125285376,
+      "file_name": ".cache\\MatMulNBits_2_0_595.const",
+      "file_size": 786432
+    },
+    "model.layers.22.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 2126071808,
+      "file_name": ".cache\\MatMulNBits_2_0_596.const",
+      "file_size": 196608
+    },
+    "model.layers.23.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2126268416,
+      "file_name": ".cache\\MatMulNBits_2_0_597.const",
+      "file_size": 6144
+    },
+    "model.layers.23.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 2126274560,
+      "file_name": ".cache\\MatMulNBits_2_0_598.const",
+      "file_size": 18874368
+    },
+    "model.layers.23.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 2145148928,
+      "file_name": ".cache\\MatMulNBits_2_0_599.const",
+      "file_size": 24576
+    },
+    "model.layers.23.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 2145173504,
+      "file_name": ".cache\\MatMulNBits_2_0_600.const",
+      "file_size": 589824
+    },
+    "model.layers.23.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 2145763328,
+      "file_name": ".cache\\MatMulNBits_2_0_601.const",
+      "file_size": 147456
+    },
+    "model.layers.23.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2145910784,
+      "file_name": ".cache\\MatMulNBits_2_0_602.const",
+      "file_size": 9437184
+    },
+    "model.layers.23.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2155347968,
+      "file_name": ".cache\\MatMulNBits_2_0_603.const",
+      "file_size": 12288
+    },
+    "model.layers.23.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2155360256,
+      "file_name": ".cache\\MatMulNBits_2_0_604.const",
+      "file_size": 294912
+    },
+    "model.layers.23.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2155655168,
+      "file_name": ".cache\\MatMulNBits_2_0_605.const",
+      "file_size": 73728
+    },
+    "model.layers.23.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2155728896,
+      "file_name": ".cache\\MatMulNBits_2_0_606.const",
+      "file_size": 9437184
+    },
+    "model.layers.23.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2165166080,
+      "file_name": ".cache\\MatMulNBits_2_0_607.const",
+      "file_size": 12288
+    },
+    "model.layers.23.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2165178368,
+      "file_name": ".cache\\MatMulNBits_2_0_608.const",
+      "file_size": 294912
+    },
+    "model.layers.23.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2165473280,
+      "file_name": ".cache\\MatMulNBits_2_0_609.const",
+      "file_size": 73728
+    },
+    "model.layers.23.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2165547008,
+      "file_name": ".cache\\MatMulNBits_2_0_610.const",
+      "file_size": 6144
+    },
+    "model.layers.23.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2165553152,
+      "file_name": ".cache\\MatMulNBits_2_0_611.const",
+      "file_size": 12582912
+    },
+    "model.layers.23.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2178136064,
+      "file_name": ".cache\\MatMulNBits_2_0_612.const",
+      "file_size": 786432
+    },
+    "model.layers.23.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2178922496,
+      "file_name": ".cache\\MatMulNBits_2_0_613.const",
+      "file_size": 98304
+    },
+    "model.layers.23.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2179020800,
+      "file_name": ".cache\\MatMulNBits_2_0_614.const",
+      "file_size": 32768
+    },
+    "model.layers.23.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2179053568,
+      "file_name": ".cache\\MatMulNBits_2_0_615.const",
+      "file_size": 12582912
+    },
+    "model.layers.23.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2191636480,
+      "file_name": ".cache\\MatMulNBits_2_0_616.const",
+      "file_size": 786432
+    },
+    "model.layers.23.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2192422912,
+      "file_name": ".cache\\MatMulNBits_2_0_617.const",
+      "file_size": 98304
+    },
+    "model.layers.23.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2192521216,
+      "file_name": ".cache\\MatMulNBits_2_0_618.const",
+      "file_size": 32768
+    },
+    "model.layers.23.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 2192553984,
+      "file_name": ".cache\\MatMulNBits_2_0_619.const",
+      "file_size": 25165824
+    },
+    "model.layers.23.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2217719808,
+      "file_name": ".cache\\MatMulNBits_2_0_620.const",
+      "file_size": 12288
+    },
+    "model.layers.23.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2217732096,
+      "file_name": ".cache\\MatMulNBits_2_0_621.const",
+      "file_size": 786432
+    },
+    "model.layers.23.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 2218518528,
+      "file_name": ".cache\\MatMulNBits_2_0_622.const",
+      "file_size": 196608
+    },
+    "model.layers.24.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2218715136,
+      "file_name": ".cache\\MatMulNBits_2_0_623.const",
+      "file_size": 6144
+    },
+    "model.layers.24.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 2218721280,
+      "file_name": ".cache\\MatMulNBits_2_0_624.const",
+      "file_size": 18874368
+    },
+    "model.layers.24.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 2237595648,
+      "file_name": ".cache\\MatMulNBits_2_0_625.const",
+      "file_size": 24576
+    },
+    "model.layers.24.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 2237620224,
+      "file_name": ".cache\\MatMulNBits_2_0_626.const",
+      "file_size": 589824
+    },
+    "model.layers.24.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 2238210048,
+      "file_name": ".cache\\MatMulNBits_2_0_627.const",
+      "file_size": 147456
+    },
+    "model.layers.24.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2238357504,
+      "file_name": ".cache\\MatMulNBits_2_0_628.const",
+      "file_size": 9437184
+    },
+    "model.layers.24.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2247794688,
+      "file_name": ".cache\\MatMulNBits_2_0_629.const",
+      "file_size": 12288
+    },
+    "model.layers.24.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2247806976,
+      "file_name": ".cache\\MatMulNBits_2_0_630.const",
+      "file_size": 294912
+    },
+    "model.layers.24.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2248101888,
+      "file_name": ".cache\\MatMulNBits_2_0_631.const",
+      "file_size": 73728
+    },
+    "model.layers.24.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2248175616,
+      "file_name": ".cache\\MatMulNBits_2_0_632.const",
+      "file_size": 9437184
+    },
+    "model.layers.24.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2257612800,
+      "file_name": ".cache\\MatMulNBits_2_0_633.const",
+      "file_size": 12288
+    },
+    "model.layers.24.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2257625088,
+      "file_name": ".cache\\MatMulNBits_2_0_634.const",
+      "file_size": 294912
+    },
+    "model.layers.24.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2257920000,
+      "file_name": ".cache\\MatMulNBits_2_0_635.const",
+      "file_size": 73728
+    },
+    "model.layers.24.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2257993728,
+      "file_name": ".cache\\MatMulNBits_2_0_636.const",
+      "file_size": 6144
+    },
+    "model.layers.24.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2257999872,
+      "file_name": ".cache\\MatMulNBits_2_0_637.const",
+      "file_size": 12582912
+    },
+    "model.layers.24.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2270582784,
+      "file_name": ".cache\\MatMulNBits_2_0_638.const",
+      "file_size": 786432
+    },
+    "model.layers.24.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2271369216,
+      "file_name": ".cache\\MatMulNBits_2_0_639.const",
+      "file_size": 98304
+    },
+    "model.layers.24.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2271467520,
+      "file_name": ".cache\\MatMulNBits_2_0_640.const",
+      "file_size": 32768
+    },
+    "model.layers.24.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2271500288,
+      "file_name": ".cache\\MatMulNBits_2_0_641.const",
+      "file_size": 12582912
+    },
+    "model.layers.24.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2284083200,
+      "file_name": ".cache\\MatMulNBits_2_0_642.const",
+      "file_size": 786432
+    },
+    "model.layers.24.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2284869632,
+      "file_name": ".cache\\MatMulNBits_2_0_643.const",
+      "file_size": 98304
+    },
+    "model.layers.24.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2284967936,
+      "file_name": ".cache\\MatMulNBits_2_0_644.const",
+      "file_size": 32768
+    },
+    "model.layers.24.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 2285000704,
+      "file_name": ".cache\\MatMulNBits_2_0_645.const",
+      "file_size": 25165824
+    },
+    "model.layers.24.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2310166528,
+      "file_name": ".cache\\MatMulNBits_2_0_646.const",
+      "file_size": 12288
+    },
+    "model.layers.24.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2310178816,
+      "file_name": ".cache\\MatMulNBits_2_0_647.const",
+      "file_size": 786432
+    },
+    "model.layers.24.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 2310965248,
+      "file_name": ".cache\\MatMulNBits_2_0_648.const",
+      "file_size": 196608
+    },
+    "model.layers.25.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2311161856,
+      "file_name": ".cache\\MatMulNBits_2_0_649.const",
+      "file_size": 6144
+    },
+    "model.layers.25.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 2311168000,
+      "file_name": ".cache\\MatMulNBits_2_0_650.const",
+      "file_size": 18874368
+    },
+    "model.layers.25.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 2330042368,
+      "file_name": ".cache\\MatMulNBits_2_0_651.const",
+      "file_size": 24576
+    },
+    "model.layers.25.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 2330066944,
+      "file_name": ".cache\\MatMulNBits_2_0_652.const",
+      "file_size": 589824
+    },
+    "model.layers.25.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 2330656768,
+      "file_name": ".cache\\MatMulNBits_2_0_653.const",
+      "file_size": 147456
+    },
+    "model.layers.25.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2330804224,
+      "file_name": ".cache\\MatMulNBits_2_0_654.const",
+      "file_size": 9437184
+    },
+    "model.layers.25.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2340241408,
+      "file_name": ".cache\\MatMulNBits_2_0_655.const",
+      "file_size": 12288
+    },
+    "model.layers.25.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2340253696,
+      "file_name": ".cache\\MatMulNBits_2_0_656.const",
+      "file_size": 294912
+    },
+    "model.layers.25.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2340548608,
+      "file_name": ".cache\\MatMulNBits_2_0_657.const",
+      "file_size": 73728
+    },
+    "model.layers.25.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2340622336,
+      "file_name": ".cache\\MatMulNBits_2_0_658.const",
+      "file_size": 9437184
+    },
+    "model.layers.25.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2350059520,
+      "file_name": ".cache\\MatMulNBits_2_0_659.const",
+      "file_size": 12288
+    },
+    "model.layers.25.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2350071808,
+      "file_name": ".cache\\MatMulNBits_2_0_660.const",
+      "file_size": 294912
+    },
+    "model.layers.25.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2350366720,
+      "file_name": ".cache\\MatMulNBits_2_0_661.const",
+      "file_size": 73728
+    },
+    "model.layers.25.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2350440448,
+      "file_name": ".cache\\MatMulNBits_2_0_662.const",
+      "file_size": 6144
+    },
+    "model.layers.25.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2350446592,
+      "file_name": ".cache\\MatMulNBits_2_0_663.const",
+      "file_size": 12582912
+    },
+    "model.layers.25.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2363029504,
+      "file_name": ".cache\\MatMulNBits_2_0_664.const",
+      "file_size": 786432
+    },
+    "model.layers.25.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2363815936,
+      "file_name": ".cache\\MatMulNBits_2_0_665.const",
+      "file_size": 98304
+    },
+    "model.layers.25.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2363914240,
+      "file_name": ".cache\\MatMulNBits_2_0_666.const",
+      "file_size": 32768
+    },
+    "model.layers.25.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2363947008,
+      "file_name": ".cache\\MatMulNBits_2_0_667.const",
+      "file_size": 12582912
+    },
+    "model.layers.25.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2376529920,
+      "file_name": ".cache\\MatMulNBits_2_0_668.const",
+      "file_size": 786432
+    },
+    "model.layers.25.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2377316352,
+      "file_name": ".cache\\MatMulNBits_2_0_669.const",
+      "file_size": 98304
+    },
+    "model.layers.25.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2377414656,
+      "file_name": ".cache\\MatMulNBits_2_0_670.const",
+      "file_size": 32768
+    },
+    "model.layers.25.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 2377447424,
+      "file_name": ".cache\\MatMulNBits_2_0_671.const",
+      "file_size": 25165824
+    },
+    "model.layers.25.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2402613248,
+      "file_name": ".cache\\MatMulNBits_2_0_672.const",
+      "file_size": 12288
+    },
+    "model.layers.25.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2402625536,
+      "file_name": ".cache\\MatMulNBits_2_0_673.const",
+      "file_size": 786432
+    },
+    "model.layers.25.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 2403411968,
+      "file_name": ".cache\\MatMulNBits_2_0_674.const",
+      "file_size": 196608
+    },
+    "model.layers.26.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2403608576,
+      "file_name": ".cache\\MatMulNBits_2_0_675.const",
+      "file_size": 6144
+    },
+    "model.layers.26.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 2403614720,
+      "file_name": ".cache\\MatMulNBits_2_0_676.const",
+      "file_size": 18874368
+    },
+    "model.layers.26.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 2422489088,
+      "file_name": ".cache\\MatMulNBits_2_0_677.const",
+      "file_size": 24576
+    },
+    "model.layers.26.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 2422513664,
+      "file_name": ".cache\\MatMulNBits_2_0_678.const",
+      "file_size": 589824
+    },
+    "model.layers.26.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 2423103488,
+      "file_name": ".cache\\MatMulNBits_2_0_679.const",
+      "file_size": 147456
+    },
+    "model.layers.26.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2423250944,
+      "file_name": ".cache\\MatMulNBits_2_0_680.const",
+      "file_size": 9437184
+    },
+    "model.layers.26.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2432688128,
+      "file_name": ".cache\\MatMulNBits_2_0_681.const",
+      "file_size": 12288
+    },
+    "model.layers.26.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2432700416,
+      "file_name": ".cache\\MatMulNBits_2_0_682.const",
+      "file_size": 294912
+    },
+    "model.layers.26.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2432995328,
+      "file_name": ".cache\\MatMulNBits_2_0_683.const",
+      "file_size": 73728
+    },
+    "model.layers.26.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2433069056,
+      "file_name": ".cache\\MatMulNBits_2_0_684.const",
+      "file_size": 9437184
+    },
+    "model.layers.26.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2442506240,
+      "file_name": ".cache\\MatMulNBits_2_0_685.const",
+      "file_size": 12288
+    },
+    "model.layers.26.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2442518528,
+      "file_name": ".cache\\MatMulNBits_2_0_686.const",
+      "file_size": 294912
+    },
+    "model.layers.26.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2442813440,
+      "file_name": ".cache\\MatMulNBits_2_0_687.const",
+      "file_size": 73728
+    },
+    "model.layers.26.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2442887168,
+      "file_name": ".cache\\MatMulNBits_2_0_688.const",
+      "file_size": 6144
+    },
+    "model.layers.26.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2442893312,
+      "file_name": ".cache\\MatMulNBits_2_0_689.const",
+      "file_size": 12582912
+    },
+    "model.layers.26.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2455476224,
+      "file_name": ".cache\\MatMulNBits_2_0_690.const",
+      "file_size": 786432
+    },
+    "model.layers.26.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2456262656,
+      "file_name": ".cache\\MatMulNBits_2_0_691.const",
+      "file_size": 98304
+    },
+    "model.layers.26.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2456360960,
+      "file_name": ".cache\\MatMulNBits_2_0_692.const",
+      "file_size": 32768
+    },
+    "model.layers.26.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2456393728,
+      "file_name": ".cache\\MatMulNBits_2_0_693.const",
+      "file_size": 12582912
+    },
+    "model.layers.26.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2468976640,
+      "file_name": ".cache\\MatMulNBits_2_0_694.const",
+      "file_size": 786432
+    },
+    "model.layers.26.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2469763072,
+      "file_name": ".cache\\MatMulNBits_2_0_695.const",
+      "file_size": 98304
+    },
+    "model.layers.26.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2469861376,
+      "file_name": ".cache\\MatMulNBits_2_0_696.const",
+      "file_size": 32768
+    },
+    "model.layers.26.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 2469894144,
+      "file_name": ".cache\\MatMulNBits_2_0_697.const",
+      "file_size": 25165824
+    },
+    "model.layers.26.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2495059968,
+      "file_name": ".cache\\MatMulNBits_2_0_698.const",
+      "file_size": 12288
+    },
+    "model.layers.26.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2495072256,
+      "file_name": ".cache\\MatMulNBits_2_0_699.const",
+      "file_size": 786432
+    },
+    "model.layers.26.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 2495858688,
+      "file_name": ".cache\\MatMulNBits_2_0_700.const",
+      "file_size": 196608
+    },
+    "model.layers.27.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2496055296,
+      "file_name": ".cache\\MatMulNBits_2_0_701.const",
+      "file_size": 6144
+    },
+    "model.layers.27.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 2496061440,
+      "file_name": ".cache\\MatMulNBits_2_0_702.const",
+      "file_size": 18874368
+    },
+    "model.layers.27.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 2514935808,
+      "file_name": ".cache\\MatMulNBits_2_0_703.const",
+      "file_size": 24576
+    },
+    "model.layers.27.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 2514960384,
+      "file_name": ".cache\\MatMulNBits_2_0_704.const",
+      "file_size": 589824
+    },
+    "model.layers.27.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 2515550208,
+      "file_name": ".cache\\MatMulNBits_2_0_705.const",
+      "file_size": 147456
+    },
+    "model.layers.27.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2515697664,
+      "file_name": ".cache\\MatMulNBits_2_0_706.const",
+      "file_size": 9437184
+    },
+    "model.layers.27.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2525134848,
+      "file_name": ".cache\\MatMulNBits_2_0_707.const",
+      "file_size": 12288
+    },
+    "model.layers.27.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2525147136,
+      "file_name": ".cache\\MatMulNBits_2_0_708.const",
+      "file_size": 294912
+    },
+    "model.layers.27.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2525442048,
+      "file_name": ".cache\\MatMulNBits_2_0_709.const",
+      "file_size": 73728
+    },
+    "model.layers.27.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2525515776,
+      "file_name": ".cache\\MatMulNBits_2_0_710.const",
+      "file_size": 9437184
+    },
+    "model.layers.27.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2534952960,
+      "file_name": ".cache\\MatMulNBits_2_0_711.const",
+      "file_size": 12288
+    },
+    "model.layers.27.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2534965248,
+      "file_name": ".cache\\MatMulNBits_2_0_712.const",
+      "file_size": 294912
+    },
+    "model.layers.27.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2535260160,
+      "file_name": ".cache\\MatMulNBits_2_0_713.const",
+      "file_size": 73728
+    },
+    "model.layers.27.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2535333888,
+      "file_name": ".cache\\MatMulNBits_2_0_714.const",
+      "file_size": 6144
+    },
+    "model.layers.27.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2535340032,
+      "file_name": ".cache\\MatMulNBits_2_0_715.const",
+      "file_size": 12582912
+    },
+    "model.layers.27.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2547922944,
+      "file_name": ".cache\\MatMulNBits_2_0_716.const",
+      "file_size": 786432
+    },
+    "model.layers.27.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2548709376,
+      "file_name": ".cache\\MatMulNBits_2_0_717.const",
+      "file_size": 98304
+    },
+    "model.layers.27.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2548807680,
+      "file_name": ".cache\\MatMulNBits_2_0_718.const",
+      "file_size": 32768
+    },
+    "model.layers.27.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2548840448,
+      "file_name": ".cache\\MatMulNBits_2_0_719.const",
+      "file_size": 12582912
+    },
+    "model.layers.27.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2561423360,
+      "file_name": ".cache\\MatMulNBits_2_0_720.const",
+      "file_size": 786432
+    },
+    "model.layers.27.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2562209792,
+      "file_name": ".cache\\MatMulNBits_2_0_721.const",
+      "file_size": 98304
+    },
+    "model.layers.27.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2562308096,
+      "file_name": ".cache\\MatMulNBits_2_0_722.const",
+      "file_size": 32768
+    },
+    "model.layers.27.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 2562340864,
+      "file_name": ".cache\\MatMulNBits_2_0_723.const",
+      "file_size": 25165824
+    },
+    "model.layers.27.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2587506688,
+      "file_name": ".cache\\MatMulNBits_2_0_724.const",
+      "file_size": 12288
+    },
+    "model.layers.27.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2587518976,
+      "file_name": ".cache\\MatMulNBits_2_0_725.const",
+      "file_size": 786432
+    },
+    "model.layers.27.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 2588305408,
+      "file_name": ".cache\\MatMulNBits_2_0_726.const",
+      "file_size": 196608
+    },
+    "model.layers.28.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2588502016,
+      "file_name": ".cache\\MatMulNBits_2_0_727.const",
+      "file_size": 6144
+    },
+    "model.layers.28.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 2588508160,
+      "file_name": ".cache\\MatMulNBits_2_0_728.const",
+      "file_size": 18874368
+    },
+    "model.layers.28.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 2607382528,
+      "file_name": ".cache\\MatMulNBits_2_0_729.const",
+      "file_size": 24576
+    },
+    "model.layers.28.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 2607407104,
+      "file_name": ".cache\\MatMulNBits_2_0_730.const",
+      "file_size": 589824
+    },
+    "model.layers.28.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 2607996928,
+      "file_name": ".cache\\MatMulNBits_2_0_731.const",
+      "file_size": 147456
+    },
+    "model.layers.28.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2608144384,
+      "file_name": ".cache\\MatMulNBits_2_0_732.const",
+      "file_size": 9437184
+    },
+    "model.layers.28.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2617581568,
+      "file_name": ".cache\\MatMulNBits_2_0_733.const",
+      "file_size": 12288
+    },
+    "model.layers.28.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2617593856,
+      "file_name": ".cache\\MatMulNBits_2_0_734.const",
+      "file_size": 294912
+    },
+    "model.layers.28.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2617888768,
+      "file_name": ".cache\\MatMulNBits_2_0_735.const",
+      "file_size": 73728
+    },
+    "model.layers.28.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2617962496,
+      "file_name": ".cache\\MatMulNBits_2_0_736.const",
+      "file_size": 9437184
+    },
+    "model.layers.28.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2627399680,
+      "file_name": ".cache\\MatMulNBits_2_0_737.const",
+      "file_size": 12288
+    },
+    "model.layers.28.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2627411968,
+      "file_name": ".cache\\MatMulNBits_2_0_738.const",
+      "file_size": 294912
+    },
+    "model.layers.28.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2627706880,
+      "file_name": ".cache\\MatMulNBits_2_0_739.const",
+      "file_size": 73728
+    },
+    "model.layers.28.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2627780608,
+      "file_name": ".cache\\MatMulNBits_2_0_740.const",
+      "file_size": 6144
+    },
+    "model.layers.28.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2627786752,
+      "file_name": ".cache\\MatMulNBits_2_0_741.const",
+      "file_size": 12582912
+    },
+    "model.layers.28.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2640369664,
+      "file_name": ".cache\\MatMulNBits_2_0_742.const",
+      "file_size": 786432
+    },
+    "model.layers.28.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2641156096,
+      "file_name": ".cache\\MatMulNBits_2_0_743.const",
+      "file_size": 98304
+    },
+    "model.layers.28.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2641254400,
+      "file_name": ".cache\\MatMulNBits_2_0_744.const",
+      "file_size": 32768
+    },
+    "model.layers.28.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2641287168,
+      "file_name": ".cache\\MatMulNBits_2_0_745.const",
+      "file_size": 12582912
+    },
+    "model.layers.28.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2653870080,
+      "file_name": ".cache\\MatMulNBits_2_0_746.const",
+      "file_size": 786432
+    },
+    "model.layers.28.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2654656512,
+      "file_name": ".cache\\MatMulNBits_2_0_747.const",
+      "file_size": 98304
+    },
+    "model.layers.28.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2654754816,
+      "file_name": ".cache\\MatMulNBits_2_0_748.const",
+      "file_size": 32768
+    },
+    "model.layers.28.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 2654787584,
+      "file_name": ".cache\\MatMulNBits_2_0_749.const",
+      "file_size": 25165824
+    },
+    "model.layers.28.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2679953408,
+      "file_name": ".cache\\MatMulNBits_2_0_750.const",
+      "file_size": 12288
+    },
+    "model.layers.28.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2679965696,
+      "file_name": ".cache\\MatMulNBits_2_0_751.const",
+      "file_size": 786432
+    },
+    "model.layers.28.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 2680752128,
+      "file_name": ".cache\\MatMulNBits_2_0_752.const",
+      "file_size": 196608
+    },
+    "model.layers.29.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2680948736,
+      "file_name": ".cache\\MatMulNBits_2_0_753.const",
+      "file_size": 6144
+    },
+    "model.layers.29.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 2680954880,
+      "file_name": ".cache\\MatMulNBits_2_0_754.const",
+      "file_size": 18874368
+    },
+    "model.layers.29.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 2699829248,
+      "file_name": ".cache\\MatMulNBits_2_0_755.const",
+      "file_size": 24576
+    },
+    "model.layers.29.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 2699853824,
+      "file_name": ".cache\\MatMulNBits_2_0_756.const",
+      "file_size": 589824
+    },
+    "model.layers.29.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 2700443648,
+      "file_name": ".cache\\MatMulNBits_2_0_757.const",
+      "file_size": 147456
+    },
+    "model.layers.29.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2700591104,
+      "file_name": ".cache\\MatMulNBits_2_0_758.const",
+      "file_size": 9437184
+    },
+    "model.layers.29.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2710028288,
+      "file_name": ".cache\\MatMulNBits_2_0_759.const",
+      "file_size": 12288
+    },
+    "model.layers.29.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2710040576,
+      "file_name": ".cache\\MatMulNBits_2_0_760.const",
+      "file_size": 294912
+    },
+    "model.layers.29.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2710335488,
+      "file_name": ".cache\\MatMulNBits_2_0_761.const",
+      "file_size": 73728
+    },
+    "model.layers.29.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2710409216,
+      "file_name": ".cache\\MatMulNBits_2_0_762.const",
+      "file_size": 9437184
+    },
+    "model.layers.29.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2719846400,
+      "file_name": ".cache\\MatMulNBits_2_0_763.const",
+      "file_size": 12288
+    },
+    "model.layers.29.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2719858688,
+      "file_name": ".cache\\MatMulNBits_2_0_764.const",
+      "file_size": 294912
+    },
+    "model.layers.29.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2720153600,
+      "file_name": ".cache\\MatMulNBits_2_0_765.const",
+      "file_size": 73728
+    },
+    "model.layers.29.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2720227328,
+      "file_name": ".cache\\MatMulNBits_2_0_766.const",
+      "file_size": 6144
+    },
+    "model.layers.29.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2720233472,
+      "file_name": ".cache\\MatMulNBits_2_0_767.const",
+      "file_size": 12582912
+    },
+    "model.layers.29.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2732816384,
+      "file_name": ".cache\\MatMulNBits_2_0_768.const",
+      "file_size": 786432
+    },
+    "model.layers.29.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2733602816,
+      "file_name": ".cache\\MatMulNBits_2_0_769.const",
+      "file_size": 98304
+    },
+    "model.layers.29.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2733701120,
+      "file_name": ".cache\\MatMulNBits_2_0_770.const",
+      "file_size": 32768
+    },
+    "model.layers.29.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2733733888,
+      "file_name": ".cache\\MatMulNBits_2_0_771.const",
+      "file_size": 12582912
+    },
+    "model.layers.29.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2746316800,
+      "file_name": ".cache\\MatMulNBits_2_0_772.const",
+      "file_size": 786432
+    },
+    "model.layers.29.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2747103232,
+      "file_name": ".cache\\MatMulNBits_2_0_773.const",
+      "file_size": 98304
+    },
+    "model.layers.29.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2747201536,
+      "file_name": ".cache\\MatMulNBits_2_0_774.const",
+      "file_size": 32768
+    },
+    "model.layers.29.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 2747234304,
+      "file_name": ".cache\\MatMulNBits_2_0_775.const",
+      "file_size": 25165824
+    },
+    "model.layers.29.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2772400128,
+      "file_name": ".cache\\MatMulNBits_2_0_776.const",
+      "file_size": 12288
+    },
+    "model.layers.29.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2772412416,
+      "file_name": ".cache\\MatMulNBits_2_0_777.const",
+      "file_size": 786432
+    },
+    "model.layers.29.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 2773198848,
+      "file_name": ".cache\\MatMulNBits_2_0_778.const",
+      "file_size": 196608
+    },
+    "model.layers.30.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2773395456,
+      "file_name": ".cache\\MatMulNBits_2_0_779.const",
+      "file_size": 6144
+    },
+    "model.layers.30.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 2773401600,
+      "file_name": ".cache\\MatMulNBits_2_0_780.const",
+      "file_size": 18874368
+    },
+    "model.layers.30.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 2792275968,
+      "file_name": ".cache\\MatMulNBits_2_0_781.const",
+      "file_size": 24576
+    },
+    "model.layers.30.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 2792300544,
+      "file_name": ".cache\\MatMulNBits_2_0_782.const",
+      "file_size": 589824
+    },
+    "model.layers.30.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 2792890368,
+      "file_name": ".cache\\MatMulNBits_2_0_783.const",
+      "file_size": 147456
+    },
+    "model.layers.30.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2793037824,
+      "file_name": ".cache\\MatMulNBits_2_0_784.const",
+      "file_size": 9437184
+    },
+    "model.layers.30.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2802475008,
+      "file_name": ".cache\\MatMulNBits_2_0_785.const",
+      "file_size": 12288
+    },
+    "model.layers.30.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2802487296,
+      "file_name": ".cache\\MatMulNBits_2_0_786.const",
+      "file_size": 294912
+    },
+    "model.layers.30.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2802782208,
+      "file_name": ".cache\\MatMulNBits_2_0_787.const",
+      "file_size": 73728
+    },
+    "model.layers.30.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2802855936,
+      "file_name": ".cache\\MatMulNBits_2_0_788.const",
+      "file_size": 9437184
+    },
+    "model.layers.30.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2812293120,
+      "file_name": ".cache\\MatMulNBits_2_0_789.const",
+      "file_size": 12288
+    },
+    "model.layers.30.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2812305408,
+      "file_name": ".cache\\MatMulNBits_2_0_790.const",
+      "file_size": 294912
+    },
+    "model.layers.30.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2812600320,
+      "file_name": ".cache\\MatMulNBits_2_0_791.const",
+      "file_size": 73728
+    },
+    "model.layers.30.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2812674048,
+      "file_name": ".cache\\MatMulNBits_2_0_792.const",
+      "file_size": 6144
+    },
+    "model.layers.30.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2812680192,
+      "file_name": ".cache\\MatMulNBits_2_0_793.const",
+      "file_size": 12582912
+    },
+    "model.layers.30.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2825263104,
+      "file_name": ".cache\\MatMulNBits_2_0_794.const",
+      "file_size": 786432
+    },
+    "model.layers.30.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2826049536,
+      "file_name": ".cache\\MatMulNBits_2_0_795.const",
+      "file_size": 98304
+    },
+    "model.layers.30.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2826147840,
+      "file_name": ".cache\\MatMulNBits_2_0_796.const",
+      "file_size": 32768
+    },
+    "model.layers.30.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2826180608,
+      "file_name": ".cache\\MatMulNBits_2_0_797.const",
+      "file_size": 12582912
+    },
+    "model.layers.30.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2838763520,
+      "file_name": ".cache\\MatMulNBits_2_0_798.const",
+      "file_size": 786432
+    },
+    "model.layers.30.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2839549952,
+      "file_name": ".cache\\MatMulNBits_2_0_799.const",
+      "file_size": 98304
+    },
+    "model.layers.30.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2839648256,
+      "file_name": ".cache\\MatMulNBits_2_0_800.const",
+      "file_size": 32768
+    },
+    "model.layers.30.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 2839681024,
+      "file_name": ".cache\\MatMulNBits_2_0_801.const",
+      "file_size": 25165824
+    },
+    "model.layers.30.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2864846848,
+      "file_name": ".cache\\MatMulNBits_2_0_802.const",
+      "file_size": 12288
+    },
+    "model.layers.30.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2864859136,
+      "file_name": ".cache\\MatMulNBits_2_0_803.const",
+      "file_size": 786432
+    },
+    "model.layers.30.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 2865645568,
+      "file_name": ".cache\\MatMulNBits_2_0_804.const",
+      "file_size": 196608
+    },
+    "model.layers.31.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2865842176,
+      "file_name": ".cache\\MatMulNBits_2_0_805.const",
+      "file_size": 6144
+    },
+    "model.layers.31.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        6144
+      ],
+      "size_in_bytes": 18874368,
+      "op_tensor_size": 18874368,
+      "offset": 2865848320,
+      "file_name": ".cache\\MatMulNBits_2_0_806.const",
+      "file_size": 18874368
+    },
+    "model.layers.31.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        6144
+      ],
+      "size_in_bytes": 24576,
+      "op_tensor_size": 24576,
+      "offset": 2884722688,
+      "file_name": ".cache\\MatMulNBits_2_0_807.const",
+      "file_size": 24576
+    },
+    "model.layers.31.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 589824,
+      "op_tensor_size": 589824,
+      "offset": 2884747264,
+      "file_name": ".cache\\MatMulNBits_2_0_808.const",
+      "file_size": 589824
+    },
+    "model.layers.31.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        147456
+      ],
+      "size_in_bytes": 147456,
+      "op_tensor_size": 147456,
+      "offset": 2885337088,
+      "file_name": ".cache\\MatMulNBits_2_0_809.const",
+      "file_size": 147456
+    },
+    "model.layers.31.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2885484544,
+      "file_name": ".cache\\MatMulNBits_2_0_810.const",
+      "file_size": 9437184
+    },
+    "model.layers.31.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2894921728,
+      "file_name": ".cache\\MatMulNBits_2_0_811.const",
+      "file_size": 12288
+    },
+    "model.layers.31.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2894934016,
+      "file_name": ".cache\\MatMulNBits_2_0_812.const",
+      "file_size": 294912
+    },
+    "model.layers.31.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2895228928,
+      "file_name": ".cache\\MatMulNBits_2_0_813.const",
+      "file_size": 73728
+    },
+    "model.layers.31.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        3072
+      ],
+      "size_in_bytes": 9437184,
+      "op_tensor_size": 9437184,
+      "offset": 2895302656,
+      "file_name": ".cache\\MatMulNBits_2_0_814.const",
+      "file_size": 9437184
+    },
+    "model.layers.31.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2904739840,
+      "file_name": ".cache\\MatMulNBits_2_0_815.const",
+      "file_size": 12288
+    },
+    "model.layers.31.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 294912,
+      "op_tensor_size": 294912,
+      "offset": 2904752128,
+      "file_name": ".cache\\MatMulNBits_2_0_816.const",
+      "file_size": 294912
+    },
+    "model.layers.31.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        73728
+      ],
+      "size_in_bytes": 73728,
+      "op_tensor_size": 73728,
+      "offset": 2905047040,
+      "file_name": ".cache\\MatMulNBits_2_0_817.const",
+      "file_size": 73728
+    },
+    "model.layers.31.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2905120768,
+      "file_name": ".cache\\MatMulNBits_2_0_818.const",
+      "file_size": 6144
+    },
+    "model.layers.31.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2905126912,
+      "file_name": ".cache\\MatMulNBits_2_0_819.const",
+      "file_size": 12582912
+    },
+    "model.layers.31.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2917709824,
+      "file_name": ".cache\\MatMulNBits_2_0_820.const",
+      "file_size": 786432
+    },
+    "model.layers.31.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2918496256,
+      "file_name": ".cache\\MatMulNBits_2_0_821.const",
+      "file_size": 98304
+    },
+    "model.layers.31.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2918594560,
+      "file_name": ".cache\\MatMulNBits_2_0_822.const",
+      "file_size": 32768
+    },
+    "model.layers.31.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        8192,
+        24,
+        64
+      ],
+      "size_in_bytes": 12582912,
+      "op_tensor_size": 12582912,
+      "offset": 2918627328,
+      "file_name": ".cache\\MatMulNBits_2_0_823.const",
+      "file_size": 12582912
+    },
+    "model.layers.31.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2931210240,
+      "file_name": ".cache\\MatMulNBits_2_0_824.const",
+      "file_size": 786432
+    },
+    "model.layers.31.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        98304
+      ],
+      "size_in_bytes": 98304,
+      "op_tensor_size": 98304,
+      "offset": 2931996672,
+      "file_name": ".cache\\MatMulNBits_2_0_825.const",
+      "file_size": 98304
+    },
+    "model.layers.31.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        8192
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2932094976,
+      "file_name": ".cache\\MatMulNBits_2_0_826.const",
+      "file_size": 32768
+    },
+    "model.layers.31.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        8192,
+        3072
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 2932127744,
+      "file_name": ".cache\\MatMulNBits_2_0_827.const",
+      "file_size": 25165824
+    },
+    "model.layers.31.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 12288,
+      "op_tensor_size": 12288,
+      "offset": 2957293568,
+      "file_name": ".cache\\MatMulNBits_2_0_828.const",
+      "file_size": 12288
+    },
+    "model.layers.31.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 786432,
+      "op_tensor_size": 786432,
+      "offset": 2957305856,
+      "file_name": ".cache\\MatMulNBits_2_0_829.const",
+      "file_size": 786432
+    },
+    "model.layers.31.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        196608
+      ],
+      "size_in_bytes": 196608,
+      "op_tensor_size": 196608,
+      "offset": 2958092288,
+      "file_name": ".cache\\MatMulNBits_2_0_830.const",
+      "file_size": 196608
+    },
+    "model.layers.32.final_norm_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        3072
+      ],
+      "size_in_bytes": 6144,
+      "op_tensor_size": 6144,
+      "offset": 2958288896,
+      "file_name": ".cache\\MatMulNBits_2_0_831.const",
+      "file_size": 6144
+    },
+    "lm_head.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        3072,
+        32064
+      ],
+      "size_in_bytes": 98500608,
+      "op_tensor_size": 98500608,
+      "offset": 2958295040,
+      "file_name": ".cache\\MatMulNBits_2_0_832.const",
+      "file_size": 98500608
+    },
+    "lm_head.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32064
+      ],
+      "size_in_bytes": 128256,
+      "op_tensor_size": 128256,
+      "offset": 3056795648,
+      "file_name": ".cache\\MatMulNBits_2_0_833.const",
+      "file_size": 128256
+    },
+    "lm_head.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        769536
+      ],
+      "size_in_bytes": 3078144,
+      "op_tensor_size": 3078144,
+      "offset": 3056923904,
+      "file_name": ".cache\\MatMulNBits_2_0_834.const",
+      "file_size": 3078144
+    },
+    "lm_head.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        769536
+      ],
+      "size_in_bytes": 769536,
+      "op_tensor_size": 769536,
+      "offset": 3060002048,
+      "file_name": ".cache\\MatMulNBits_2_0_835.const",
+      "file_size": 769536
+    },
+    "past_key_values.0.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 0
+    },
+    "past_key_values.0.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 25165824
+    },
+    "present.0.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 0
+    },
+    "present.0.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 25165824
+    },
+    "past_key_values.1.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 50331648
+    },
+    "past_key_values.1.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 75497472
+    },
+    "present.1.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 50331648
+    },
+    "present.1.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 75497472
+    },
+    "past_key_values.2.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 100663296
+    },
+    "past_key_values.2.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 125829120
+    },
+    "present.2.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 100663296
+    },
+    "present.2.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 125829120
+    },
+    "past_key_values.3.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 150994944
+    },
+    "past_key_values.3.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 176160768
+    },
+    "present.3.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 150994944
+    },
+    "present.3.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 176160768
+    },
+    "past_key_values.4.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 201326592
+    },
+    "past_key_values.4.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 226492416
+    },
+    "present.4.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 201326592
+    },
+    "present.4.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 226492416
+    },
+    "past_key_values.5.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 251658240
+    },
+    "past_key_values.5.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 276824064
+    },
+    "present.5.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 251658240
+    },
+    "present.5.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 276824064
+    },
+    "past_key_values.6.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 301989888
+    },
+    "past_key_values.6.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 327155712
+    },
+    "present.6.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 301989888
+    },
+    "present.6.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 327155712
+    },
+    "past_key_values.7.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 352321536
+    },
+    "past_key_values.7.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 377487360
+    },
+    "present.7.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 352321536
+    },
+    "present.7.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 377487360
+    },
+    "past_key_values.8.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 402653184
+    },
+    "past_key_values.8.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 427819008
+    },
+    "present.8.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 402653184
+    },
+    "present.8.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 427819008
+    },
+    "past_key_values.9.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 452984832
+    },
+    "past_key_values.9.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 478150656
+    },
+    "present.9.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 452984832
+    },
+    "present.9.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 478150656
+    },
+    "past_key_values.10.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 503316480
+    },
+    "past_key_values.10.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 528482304
+    },
+    "present.10.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 503316480
+    },
+    "present.10.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 528482304
+    },
+    "past_key_values.11.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 553648128
+    },
+    "past_key_values.11.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 578813952
+    },
+    "present.11.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 553648128
+    },
+    "present.11.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 578813952
+    },
+    "past_key_values.12.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 603979776
+    },
+    "past_key_values.12.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 629145600
+    },
+    "present.12.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 603979776
+    },
+    "present.12.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 629145600
+    },
+    "past_key_values.13.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 654311424
+    },
+    "past_key_values.13.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 679477248
+    },
+    "present.13.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 654311424
+    },
+    "present.13.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 679477248
+    },
+    "past_key_values.14.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 704643072
+    },
+    "past_key_values.14.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 729808896
+    },
+    "present.14.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 704643072
+    },
+    "present.14.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 729808896
+    },
+    "past_key_values.15.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 754974720
+    },
+    "past_key_values.15.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 780140544
+    },
+    "present.15.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 754974720
+    },
+    "present.15.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 780140544
+    },
+    "past_key_values.16.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 805306368
+    },
+    "past_key_values.16.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 830472192
+    },
+    "present.16.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 805306368
+    },
+    "present.16.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 830472192
+    },
+    "past_key_values.17.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 855638016
+    },
+    "past_key_values.17.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 880803840
+    },
+    "present.17.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 855638016
+    },
+    "present.17.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 880803840
+    },
+    "past_key_values.18.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 905969664
+    },
+    "past_key_values.18.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 931135488
+    },
+    "present.18.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 905969664
+    },
+    "present.18.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 931135488
+    },
+    "past_key_values.19.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 956301312
+    },
+    "past_key_values.19.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 981467136
+    },
+    "present.19.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 956301312
+    },
+    "present.19.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 981467136
+    },
+    "past_key_values.20.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1006632960
+    },
+    "past_key_values.20.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1031798784
+    },
+    "present.20.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1006632960
+    },
+    "present.20.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1031798784
+    },
+    "past_key_values.21.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1056964608
+    },
+    "past_key_values.21.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1082130432
+    },
+    "present.21.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1056964608
+    },
+    "present.21.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1082130432
+    },
+    "past_key_values.22.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1107296256
+    },
+    "past_key_values.22.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1132462080
+    },
+    "present.22.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1107296256
+    },
+    "present.22.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1132462080
+    },
+    "past_key_values.23.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1157627904
+    },
+    "past_key_values.23.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1182793728
+    },
+    "present.23.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1157627904
+    },
+    "present.23.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1182793728
+    },
+    "past_key_values.24.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1207959552
+    },
+    "past_key_values.24.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1233125376
+    },
+    "present.24.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1207959552
+    },
+    "present.24.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1233125376
+    },
+    "past_key_values.25.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1258291200
+    },
+    "past_key_values.25.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1283457024
+    },
+    "present.25.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1258291200
+    },
+    "present.25.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1283457024
+    },
+    "past_key_values.26.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1308622848
+    },
+    "past_key_values.26.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1333788672
+    },
+    "present.26.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1308622848
+    },
+    "present.26.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1333788672
+    },
+    "past_key_values.27.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1358954496
+    },
+    "past_key_values.27.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1384120320
+    },
+    "present.27.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1358954496
+    },
+    "present.27.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1384120320
+    },
+    "past_key_values.28.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1409286144
+    },
+    "past_key_values.28.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1434451968
+    },
+    "present.28.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1409286144
+    },
+    "present.28.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1434451968
+    },
+    "past_key_values.29.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1459617792
+    },
+    "past_key_values.29.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1484783616
+    },
+    "present.29.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1459617792
+    },
+    "present.29.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1484783616
+    },
+    "past_key_values.30.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1509949440
+    },
+    "past_key_values.30.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1535115264
+    },
+    "present.30.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1509949440
+    },
+    "present.30.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1535115264
+    },
+    "past_key_values.31.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1560281088
+    },
+    "past_key_values.31.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1585446912
+    },
+    "present.31.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1560281088
+    },
+    "present.31.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        32,
+        4096,
+        96
+      ],
+      "size_in_bytes": 25165824,
+      "op_tensor_size": 25165824,
+      "offset": 1585446912
+    },
+    "sin_cos_cache_token": {
+      "packed_buffer_label": "ext_buf_1",
+      "xrt_arg_id": 6,
+      "dtype": "bfloat16",
+      "shape": [
+        135168,
+        96
+      ],
+      "size_in_bytes": 25952256,
+      "op_tensor_size": 25952256,
+      "offset": 0
+    }
+  },
+  "aux_info": {
+    "is_llm": true
+  }
+}
\ No newline at end of file