from ...quantization import FORMAT, FORMAT_FIELD_JSON, GPTQ, QuantizeConfig
from ...utils.backend import BACKEND
+ from ...utils.exllama import exllama_set_max_input_length
from ...utils.importer import select_quant_linear
from ...utils.logger import setup_logger
from ...utils.model import convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, gptqmodel_post_init
@@ -67,13 +68,15 @@ def __init__(
        desc_act: bool = False,
        sym: bool = True,
        true_sequential: bool = True,
+         use_cuda_fp16: bool = False,
        model_seqlen: Optional[int] = None,
        block_name_to_quantize: Optional[str] = None,
        module_name_preceding_first_block: Optional[List[str]] = None,
        batch_size: int = 1,
        pad_token_id: Optional[int] = None,
        use_exllama: Optional[bool] = None,
        max_input_length: Optional[int] = None,
+         exllama_config: Optional[Dict[str, Any]] = None,
        cache_block_outputs: bool = True,
        modules_in_block_to_quantize: Optional[List[List[str]]] = None,
        **kwargs,
@@ -88,13 +91,16 @@ def __init__(
        self.desc_act = desc_act
        self.sym = sym
        self.true_sequential = true_sequential
+         self.use_cuda_fp16 = use_cuda_fp16
        self.model_seqlen = model_seqlen
        self.block_name_to_quantize = block_name_to_quantize
        self.module_name_preceding_first_block = module_name_preceding_first_block
        self.batch_size = batch_size
        self.pad_token_id = pad_token_id
        self.use_exllama = use_exllama
        self.max_input_length = max_input_length
+         self.exllama_config = exllama_config
+         self.disable_exllama = kwargs.pop("disable_exllama", None)
        self.cache_block_outputs = cache_block_outputs
        self.modules_in_block_to_quantize = modules_in_block_to_quantize
        self.post_init()
@@ -104,6 +110,11 @@ def post_init(self):
            raise ValueError(OPTIMUM_INSTALL_HINT)
        super().post_init()

+ class ExllamaVersion(int, Enum):
+     ONE = 1
+     TWO = 2
+
+
class GPTQModelQuantizer(object):
    r"""
    A simple API for GPTQ Quantization
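
Since `ExllamaVersion` subclasses both `int` and `Enum`, its members compare equal to plain integers; that is why a user-supplied `{"version": 2}` and the internal default `{"version": ExllamaVersion.TWO}` behave the same in the checks added further down. A minimal sketch of that equivalence (standalone; only the enum definition is taken from this diff):

    from enum import Enum

    class ExllamaVersion(int, Enum):
        ONE = 1
        TWO = 2

    assert ExllamaVersion.TWO == 2                        # int-style comparison holds
    assert 2 in [ExllamaVersion.ONE, ExllamaVersion.TWO]  # so plain ints pass the membership check
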
@@ -119,12 +130,14 @@ def __init__(
        desc_act: bool = False,
        sym: bool = True,
        true_sequential: bool = True,
+         use_cuda_fp16: bool = False,
        model_seqlen: Optional[int] = None,
        block_name_to_quantize: Optional[str] = None,
        module_name_preceding_first_block: Optional[List[str]] = None,
        batch_size: int = 1,
        pad_token_id: Optional[int] = None,
-         use_triton=False,
+         disable_exllama: bool = False,
+         exllama_config: Dict[str, Any] = None,
        max_input_length: Optional[int] = None,
        cache_block_outputs: Optional[bool] = True,
        modules_in_block_to_quantize: Optional[List[List[str]]] = None,
@@ -153,6 +166,8 @@ def __init__(
                Whether to perform sequential quantization even within a single Transformer block.
                Instead of quantizing the entire block at once, we perform layer-wise quantization.
                As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers.
+             use_cuda_fp16 (`bool`, defaults to `False`):
+                 Whether or not to use the optimized CUDA kernel for fp16 models. Requires the model to be in fp16.
            model_seqlen (`Optional[int]`, defaults to `None`):
                The maximum sequence length that the model can take.
            block_name_to_quantize (`Optional[str]`, defaults to `None`):
@@ -163,6 +178,8 @@ def __init__(
                The batch size of the dataset
            pad_token_id (`Optional[int]`, defaults to `None`):
                The pad token id. Needed to prepare the dataset when `batch_size` > 1.
+             exllama_config (`Dict[str, Any]`, *optional*):
+                 The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
            max_input_length (`Optional[int]`, defaults to `None`):
                The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
                It is specific to the exllama backend with act-order.
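
Taken together, the new constructor arguments can be passed as in this hedged sketch; the argument values are illustrative, only parameters visible in this diff are assumed, and the import path for `GPTQModelQuantizer` and `ExllamaVersion` depends on where this module lives:

    quantizer = GPTQModelQuantizer(
        bits=4,
        dataset="c4",                                    # calibration dataset, illustrative value
        group_size=128,
        desc_act=True,
        use_cuda_fp16=True,                              # only meaningful for fp16 models
        exllama_config={"version": ExllamaVersion.ONE},  # or simply {"version": 1}
        max_input_length=8192,                           # exllama v1 + act-order buffer size
    )
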
@@ -178,6 +195,9 @@ def __init__(
        if not OPTIMUM_AVAILABLE:
            raise ValueError(OPTIMUM_INSTALL_HINT)

+         if disable_exllama:
+             logger.warning("gptqmodel does not support the parameter `disable_exllama=True`. Setting `disable_exllama=False`.")
+
        self.bits = bits
        self.dataset = dataset
        self.group_size = group_size
@@ -186,12 +206,13 @@ def __init__(
        self.desc_act = desc_act
        self.sym = sym
        self.true_sequential = true_sequential
+         self.use_cuda_fp16 = use_cuda_fp16
        self.model_seqlen = model_seqlen
        self.block_name_to_quantize = block_name_to_quantize
        self.module_name_preceding_first_block = module_name_preceding_first_block
        self.batch_size = batch_size
        self.pad_token_id = pad_token_id
-         self.use_triton = use_triton
+         self.exllama_config = exllama_config
        self.max_input_length = max_input_length
        self.quant_method = QuantizationMethod.GPTQ
        self.cache_block_outputs = cache_block_outputs
@@ -230,6 +251,18 @@ def __init__(
        if self.damp_auto_increment < 0:
            raise ValueError("damp_auto_increment must be greater than 0.")

+         if self.exllama_config is None:
+             self.exllama_config = {"version": ExllamaVersion.TWO}
+         else:
+             if "version" not in self.exllama_config:
+                 raise ValueError("`exllama_config` needs to have a `version` key")
+             elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
+                 version = self.exllama_config["version"]
+                 raise ValueError(
+                     f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
+                 )
+         self.exllama_version = self.exllama_config["version"]
+
    def to_dict(self):
        """
        Returns the args in dict format.
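
The observable behaviour of this validation, assuming optimum is installed and the remaining constructor defaults apply, is sketched below:

    q = GPTQModelQuantizer(bits=4, dataset="c4")
    assert q.exllama_config == {"version": ExllamaVersion.TWO}  # default when unset
    assert q.exllama_version == ExllamaVersion.TWO

    try:
        GPTQModelQuantizer(bits=4, dataset="c4", exllama_config={"kernel": 2})  # no "version" key
    except ValueError:
        pass
    try:
        GPTQModelQuantizer(bits=4, dataset="c4", exllama_config={"version": 3})  # unknown version
    except ValueError:
        pass
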
@@ -419,6 +452,9 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
        else:
            has_device_map = False

+         if hasattr(model, "dtype"):
+             self.use_cuda_fp16 = model.dtype == torch.float16
+
        if self.model_seqlen is None:
            # We allow a max value of 4028 to avoid passing data with huge length to the model during the calibration step
            self.model_seqlen = min(4028, get_seqlen(model))
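
So whatever was passed at construction time, `use_cuda_fp16` is re-derived from the model's dtype when quantization starts; loading the model in half precision is enough to enable the fp16 path. A hedged sketch (the model id and tokenizer handling are illustrative):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
    model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype=torch.float16)

    quantizer.quantize_model(model, tokenizer=tokenizer)
    # model.dtype == torch.float16, so quantizer.use_cuda_fp16 is now True
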
@@ -607,7 +643,15 @@ def tmp(_, input, output):
        torch.cuda.empty_cache()

        if self.bits == 4:
-             self.use_triton = True
+             # device not on gpu
+             if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])):
+                 raise ValueError("Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU.")
+             elif self.exllama_version == ExllamaVersion.TWO:
+                 logger.warning(
+                     "Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights."
+                     "Setting `exllama_version=ExllamaVersion.ONE`. You should only use Exllamav2 backend for inference."
+                 )
+                 self.exllama_version = ExllamaVersion.ONE
        # Step 4: Pack the model at the end (Replacing the layers)
        self.pack_model(model=model, quantizers=quantizers)
@@ -645,6 +689,12 @@ class StoreAttr(object):
        model.quantize_config = StoreAttr()
        model.quantize_config.desc_act = self.desc_act
        model = gptqmodel_post_init(model, use_act_order=self.desc_act)
+         if (
+             self.desc_act
+             and self.exllama_version == ExllamaVersion.ONE
+             and self.max_input_length is not None
+         ):
+             model = exllama_set_max_input_length(model, self.max_input_length)
        return model

    def pack_model(
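
The same helper can also be applied on its own when a post-initialized exllama v1 model with act-order needs to accept longer prompts than its current buffer allows; a hedged sketch using only the call shape visible above, with an illustrative length:

    # Re-allocate the exllama v1 buffers for prompts of up to 8192 tokens.
    model = exllama_set_max_input_length(model, 8192)

This mirrors what the branch above does automatically when `max_input_length` is passed to the quantizer.
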
@@ -680,10 +730,13 @@ def pack_model(
        logger.info("Model packed.")

    def select_quantlinear(self):
-         if self.use_triton:
-             backend = BACKEND.TRITON
-         else:
+         if self.exllama_version == ExllamaVersion.ONE:
+             backend = BACKEND.EXLLAMA
+         elif self.exllama_version == ExllamaVersion.TWO:
            backend = BACKEND.EXLLAMA_V2
+         else:
+             backend = BACKEND.AUTO
+
        QuantLinear = select_quant_linear(
            sym=self.sym,
            desc_act=self.desc_act,
@@ -735,6 +788,7 @@ def load_quantized_model(
    offload_folder: Optional[str] = None,
    offload_buffers: Optional[str] = None,
    offload_state_dict: bool = False,
+     exllama_config: Optional[Dict[str, Any]] = None,
    max_input_length: Optional[int] = None,
):
    """
@@ -768,6 +822,8 @@ def load_quantized_model(
            If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
            the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
            picked contains `"disk"` values.
+         exllama_config (`Optional[Dict[str, Any]]`, defaults to `None`):
+             The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
        max_input_length (`Optional[int]`, defaults to `None`):
            The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
            It is specific to the exllama backend with act-order.
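
A hedged end-to-end sketch of how these two arguments are meant to be used together when reloading a quantized checkpoint; the paths are placeholders and the empty-weights setup is the usual accelerate pattern rather than something mandated by this diff:

    from accelerate import init_empty_weights
    from transformers import AutoConfig, AutoModelForCausalLM

    save_folder = "path/to/quantized-checkpoint"          # placeholder path
    config = AutoConfig.from_pretrained(save_folder)
    with init_empty_weights():
        empty_model = AutoModelForCausalLM.from_config(config)
    empty_model.tie_weights()

    model = load_quantized_model(
        empty_model,
        save_folder=save_folder,
        device_map="auto",
        exllama_config={"version": ExllamaVersion.ONE},   # pick the exllama v1 kernel
        max_input_length=8192,                            # enlarge the act-order buffer
    )
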
@@ -790,6 +846,17 @@ def load_quantized_model(
        device_map = {"": torch.cuda.current_device()}
        logger.info("The device_map was not initialized." "Setting device_map to `{'':torch.cuda.current_device()}`.")

+     if exllama_config is None:
+         exllama_config = {"version": ExllamaVersion.TWO}
+     else:
+         if "version" not in exllama_config:
+             raise ValueError("`exllama_config` needs to have a `version` key")
+         elif exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
+             version = exllama_config["version"]
+             raise ValueError(
+                 f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
+             )
+
    # this branch will check if model is from huggingface
    try:
        if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
@@ -802,6 +869,8 @@ def load_quantized_model(
            f"Failed to load quantization config from {save_folder} (lookup for traceback): {err}\nTip: If the save directory is saved from a transformers.PreTrainedModel, make sure that `config.json` contains a 'quantization_config' key."
        ) from err
    quantizer = GPTQModelQuantizer.from_dict(quantize_config_dict)
+     quantizer.exllama_config = exllama_config
+     quantizer.exllama_version = quantizer.exllama_config["version"]
    quantizer.max_input_length = max_input_length

    model = quantizer.convert_model(model)