wikeeyang
/

Hunyuan-Image-30-Qint4

text-generation

Model card Files Files and versions

wikeeyang committed 20 days ago

Commit

56d9e9b

·

verified ·

1 Parent(s): ee9d5b2

Upload load_quantized_model.py

Files changed (1) hide show

load_quantized_model.py +66 -0

load_quantized_model.py ADDED Viewed

	@@ -0,0 +1,66 @@

+# load_quantized_model.py
+import json
+import torch
+from safetensors.torch import load_file
+from optimum.quanto import requantize, quantize, qint4
+from hunyuan_image_3.hunyuan import HunyuanImage3ForCausalMM
+from transformers import AutoConfig, QuantoConfig
+from transformers.generation.utils import GenerationConfig
+def load_quantized_hi3_m1(model_path):
+    print(f"Loading model architecture from {model_path} to CPU...")
+    Qmodel = HunyuanImage3ForCausalMM.from_pretrained(
+        model_path,
+        dtype=torch.bfloat16,
+        device_map=None,
+        attn_implementation="sdpa",
+        moe_impl="eager",
+        moe_drop_tokens=True,
+        trust_remote_code=True,
+        low_cpu_mem_usage=False,
+    )
+    print("Applying int4 quantization structure...")
+    quantize(Qmodel, weights=qint4)
+    print("Loading quantized weights...")
+    state_dict = load_file(f"{model_path}/model.safetensors")
+    Qmodel.load_state_dict(state_dict, strict=False, assign=True)
+    print("Moving quantized model to GPU...")
+    Qmodel = Qmodel.to("cuda")
+    return Qmodel
+def load_quantized_hi3_m2(model_path):
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    state_dict = load_file(f"{model_path}/model.safetensors")
+    with open(f"{model_path}/quantization_map.json", "r") as f: quantization_map = json.load(f)
+    print("Create Meta model and Loading quantized weights to CPU...")
+    with torch.device('meta'): Qmodel = HunyuanImage3ForCausalMM(config)
+    Qmodel = Qmodel.to(torch.bfloat16)
+    requantize(Qmodel, state_dict, quantization_map, device=torch.device('cpu'))
+    generation_config = GenerationConfig.from_pretrained(model_path)
+    Qmodel.generation_config = generation_config
+    print("Moving quantized model to GPU...")
+    Qmodel = Qmodel.to(torch.device('cuda'))
+    return Qmodel
+# To use these loaders, modify your "app/pipeline.py" script as follows:
+# from load_quantized_model import load_quantized_hi3_m1, load_quantized_hi3_m2
+# replace:
+#        self.model = HunyuanImage3ForCausalMM.from_pretrained(args.model_id, **kwargs)
+# with:
+#        self.model = load_quantized_hi3_m1(args.model_id)
+# or with:
+#        self.model = load_quantized_hi3_m2(args.model_id)