wikeeyang committed
Commit 56d9e9b · verified · parent ee9d5b2

Upload load_quantized_model.py

Files changed (1):
  load_quantized_model.py (+66 −0)
load_quantized_model.py ADDED
# load_quantized_model.py
import json

import torch
from safetensors.torch import load_file
from optimum.quanto import requantize, quantize, qint4
from transformers import AutoConfig
from transformers.generation.utils import GenerationConfig

from hunyuan_image_3.hunyuan import HunyuanImage3ForCausalMM


def load_quantized_hi3_m1(model_path):
    """Method 1: build the full model on CPU, re-apply the int4 quantization
    structure, then overwrite the weights from the quantized checkpoint."""
    print(f"Loading model architecture from {model_path} to CPU...")
    Qmodel = HunyuanImage3ForCausalMM.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        device_map=None,
        attn_implementation="sdpa",
        moe_impl="eager",
        moe_drop_tokens=True,
        trust_remote_code=True,
        low_cpu_mem_usage=False,
    )

    print("Applying int4 quantization structure...")
    quantize(Qmodel, weights=qint4)

    print("Loading quantized weights...")
    state_dict = load_file(f"{model_path}/model.safetensors")
    Qmodel.load_state_dict(state_dict, strict=False, assign=True)

    print("Moving quantized model to GPU...")
    Qmodel = Qmodel.to("cuda")

    return Qmodel


def load_quantized_hi3_m2(model_path):
    """Method 2: instantiate the model on the meta device (no weight
    allocation), then requantize it directly from the saved state dict and
    quantization map. Requires quantization_map.json next to the weights."""
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

    state_dict = load_file(f"{model_path}/model.safetensors")
    with open(f"{model_path}/quantization_map.json", "r") as f:
        quantization_map = json.load(f)

    print("Creating meta model and loading quantized weights to CPU...")
    with torch.device("meta"):
        Qmodel = HunyuanImage3ForCausalMM(config)
    Qmodel = Qmodel.to(torch.bfloat16)
    requantize(Qmodel, state_dict, quantization_map, device=torch.device("cpu"))

    # The meta-device constructor bypasses from_pretrained, so the generation
    # config has to be restored explicitly.
    generation_config = GenerationConfig.from_pretrained(model_path)
    Qmodel.generation_config = generation_config

    print("Moving quantized model to GPU...")
    Qmodel = Qmodel.to(torch.device("cuda"))

    return Qmodel


# Modify your "app/pipeline.py" script as below:
#
# from load_quantized_model import load_quantized_hi3_m1, load_quantized_hi3_m2
#
# replace:
#     self.model = HunyuanImage3ForCausalMM.from_pretrained(args.model_id, **kwargs)
# with:
#     self.model = load_quantized_hi3_m1(args.model_id)
# or with:
#     self.model = load_quantized_hi3_m2(args.model_id)
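
Both loaders expect a pre-quantized checkpoint in model_path: an int4 model.safetensors, plus a quantization_map.json for load_quantized_hi3_m2. For reference, below is a minimal sketch of how such a checkpoint could be produced with optimum.quanto's quantize/freeze/quantization_map serialization workflow. It is not part of this commit; the source and output paths are placeholders, and tied or shared tensors may need extra handling before save_file.

# make_quantized_checkpoint.py -- illustrative sketch, not part of this commit
import json

import torch
from safetensors.torch import save_file
from optimum.quanto import quantize, freeze, qint4, quantization_map

from hunyuan_image_3.hunyuan import HunyuanImage3ForCausalMM

src = "path/to/HunyuanImage-3"        # placeholder: original bf16 checkpoint
dst = "path/to/HunyuanImage-3-int4"   # placeholder: output directory

model = HunyuanImage3ForCausalMM.from_pretrained(
    src, dtype=torch.bfloat16, device_map=None, trust_remote_code=True
)

quantize(model, weights=qint4)   # insert the int4 quantization structure
freeze(model)                    # materialize the quantized weights

# Write exactly the two artifacts the loaders above read back.
save_file(model.state_dict(), f"{dst}/model.safetensors")
with open(f"{dst}/quantization_map.json", "w") as f:
    json.dump(quantization_map(model), f)

After that, either loader can be pointed at the output directory, e.g. self.model = load_quantized_hi3_m2("path/to/HunyuanImage-3-int4").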