#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import subprocess
from ipex_llm.utils.common import invalidInputError
import platform
# locate the bundled native quantize binaries relative to this file
dirname, _ = os.path.split(os.path.abspath(__file__))
libs_dirname = os.path.dirname(dirname)
# ggml quantized tensor types; note that these differ from the per-model-family
# file quantize types (_quantize_type) defined below
ggml_tensor_qtype = {"sym_int4": 2, # q4_0 in ggml
"asym_int4": 3, # q4_1 in ggml
"sym_int5": 6, # q5_0 in ggml
"asym_int5": 7, # q5_1 in ggml
"sym_int8": 8, # q8_0 in ggml
"nf4": 10,
"nf3": 11,
"fp16": 12,
"fp8_e4m3": 15, # fp8 in e4m3 format
"fp4": 16,
"mixed_fp4": 17, # Mixture of Formats Quantization 4 bits
"mixed_fp8": 18, # Mixture of Formats Quantization 8 bits
"fp8_e5m2": 19, # fp8 in e5m2 format
"fp8": 19, # fp8 in e5m2 format
"bf16": 20,
"gguf_iq2_xxs": 21,
"gguf_iq2_xs": 22,
"q2_k": 23,
"gguf_iq1_s": 24,
"gguf_iq1_m": 25,
"q6_k": 26,
"q4_k": 27,
"q5_k": 28,
"fp6": 29}
# mixed-precision quantization types from llama.cpp
gguf_mixed_qtype = {"gguf_q4k_s": 101,
"gguf_q4k_m": 102}
_llama_quantize_type = {"q4_0": 2,
"q4_1": 3,
"q5_0": 8,
"q5_1": 9,
"q8_0": 7}
_bloom_quantize_type = {"q4_0": 2,
"q4_1": 3}
_gptneox_quantize_type = {"q4_0": 2,
"q4_1": 3,
"q5_0": 8,
"q5_1": 9,
"q8_0": 7}
_starcoder_quantize_type = {"q4_0": 2,
"q4_1": 3,
"q5_0": 8,
"q5_1": 9,
"q8_0": 7}
_quantize_type = {"llama": _llama_quantize_type,
"bloom": _bloom_quantize_type,
"gptneox": _gptneox_quantize_type,
"starcoder": _starcoder_quantize_type}


def quantize(input_path: str, output_path: str,
             model_family: str, dtype: str = 'q4_0'):
"""
    Quantize a ggml file to lower precision.

    :param input_path: Path of the input ggml file, for example `./ggml-model-f16.bin`.
    :param output_path: Save path of the output quantized model. You must pass a directory
           in which all related output will be saved. The filename of the quantized model
           will be like `bigdl_llm_llama_q4_0.bin`.
:param model_family: Which model family your input model belongs to.
Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported.
    :param dtype: Quantization method, which affects the resulting model's disk size and
           inference speed. Defaults to `q4_0`. Different model families may support
           different types; the currently supported lists are:
llama : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"
bloom : "q4_0", "q4_1"
gptneox : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"
starcoder : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"
    :return: The path (str) to the quantized ggml binary checkpoint.
"""
    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder'],
                      "Now we only support quantization of model families "
                      "'llama', 'bloom', 'gptneox' and 'starcoder'.",
                      "{} is not in the list.".format(model_family))
    invalidInputError(os.path.isfile(input_path),
                      "The file {} was not found.".format(input_path))
invalidInputError(os.path.isdir(output_path),
"The output_path {} is not a directory".format(output_path))
# convert quantize type str into corresponding int value
quantize_type_map = _quantize_type[model_family]
output_filename = "bigdl_llm_{}_{}.bin".format(model_family,
dtype.lower())
output_path = os.path.join(output_path, output_filename)
    invalidInputError(dtype.lower() in quantize_type_map,
                      "{0} model only accepts {1} now, "
                      "but you passed in {2}.".format(model_family,
                                                      list(quantize_type_map.keys()),
                                                      dtype))
quantize_type = quantize_type_map[dtype]
if platform.platform().startswith('Windows'):
suffix = '.exe'
else:
suffix = ''
    # build the argument list directly so that paths containing spaces survive
    quantize_args = ["{0}/libs/quantize-{1}{2}".format(libs_dirname,
                                                       model_family,
                                                       suffix),
                     input_path,
                     output_path,
                     str(quantize_type)]
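    # Illustrative resulting invocation (paths here are hypothetical):
    #     <libs_dirname>/libs/quantize-llama ./ggml-model-f16.bin \
    #         ./out/bigdl_llm_llama_q4_0.bin 2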
    p = subprocess.run(quantize_args, capture_output=True)
    error_message = p.stderr.decode(errors='replace')
    invalidInputError(not p.returncode,
                      "Failed to quantize {}, error message is: {}.".format(str(input_path),
                                                                            error_message))
return str(output_path)
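

if __name__ == "__main__":
    # A minimal usage sketch (the paths below are hypothetical, and the
    # libs/quantize-<family> binaries must already be built and shipped
    # alongside this package for the call to succeed):
    quantized_ckpt = quantize(input_path="./ggml-model-f16.bin",
                              output_path="./quantized-models",  # must be an existing dir
                              model_family="llama",
                              dtype="q4_0")
    print(quantized_ckpt)  # e.g. ./quantized-models/bigdl_llm_llama_q4_0.bin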