Commit 414bdce

Revert "remove exllama v1 (ModelCloud#453)"
This reverts commit 5adae36
1 parent cd9326d commit 414bdce

File tree: 13 files changed, +1715 -1072 lines changed

gptqmodel/__init__.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -1,4 +1,5 @@
 from .models import GPTQModel
 from .quantization import BaseQuantizeConfig, QuantizeConfig
 from .utils import BACKEND, get_backend
+from .utils.exllama import exllama_set_max_input_length
 from .version import __version__
```
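With exllama v1 restored, `exllama_set_max_input_length` is re-exported from the package root again. A minimal usage sketch, not part of the commit; the checkpoint path, the `from_quantized` loader arguments, and the 4096 value are illustrative assumptions:

```python
# Illustrative sketch: resize the exllama v1 input buffer after loading a GPTQ
# model quantized with act-order (desc_act=True). Path and loader arguments are
# placeholders, not taken from this commit.
from gptqmodel import GPTQModel, exllama_set_max_input_length

model = GPTQModel.from_quantized("path/to/gptq-checkpoint", device="cuda:0")
# Per the docstring restored below, this buffer is specific to the exllama
# backend with act-order and depends on the maximum expected input length.
model = exllama_set_max_input_length(model, 4096)
```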

gptqmodel/integration/optimum/quantizer.py

Lines changed: 75 additions & 6 deletions

```diff
@@ -44,6 +44,7 @@

 from ...quantization import FORMAT, FORMAT_FIELD_JSON, GPTQ, QuantizeConfig
 from ...utils.backend import BACKEND
+from ...utils.exllama import exllama_set_max_input_length
 from ...utils.importer import select_quant_linear
 from ...utils.logger import setup_logger
 from ...utils.model import convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, gptqmodel_post_init
@@ -67,13 +68,15 @@ def __init__(
         desc_act: bool = False,
         sym: bool = True,
         true_sequential: bool = True,
+        use_cuda_fp16: bool = False,
         model_seqlen: Optional[int] = None,
         block_name_to_quantize: Optional[str] = None,
         module_name_preceding_first_block: Optional[List[str]] = None,
         batch_size: int = 1,
         pad_token_id: Optional[int] = None,
         use_exllama: Optional[bool] = None,
         max_input_length: Optional[int] = None,
+        exllama_config: Optional[Dict[str, Any]] = None,
         cache_block_outputs: bool = True,
         modules_in_block_to_quantize: Optional[List[List[str]]] = None,
         **kwargs,
@@ -88,13 +91,16 @@ def __init__(
         self.desc_act = desc_act
         self.sym = sym
         self.true_sequential = true_sequential
+        self.use_cuda_fp16 = use_cuda_fp16
         self.model_seqlen = model_seqlen
         self.block_name_to_quantize = block_name_to_quantize
         self.module_name_preceding_first_block = module_name_preceding_first_block
         self.batch_size = batch_size
         self.pad_token_id = pad_token_id
         self.use_exllama = use_exllama
         self.max_input_length = max_input_length
+        self.exllama_config = exllama_config
+        self.disable_exllama = kwargs.pop("disable_exllama", None)
         self.cache_block_outputs = cache_block_outputs
         self.modules_in_block_to_quantize = modules_in_block_to_quantize
         self.post_init()
@@ -104,6 +110,11 @@ def post_init(self):
             raise ValueError(OPTIMUM_INSTALL_HINT)
         super().post_init()

+class ExllamaVersion(int, Enum):
+    ONE = 1
+    TWO = 2
+
+
 class GPTQModelQuantizer(object):
     r"""
     A simple API for GPTQ Quantization
@@ -119,12 +130,14 @@ def __init__(
         desc_act: bool = False,
         sym: bool = True,
         true_sequential: bool = True,
+        use_cuda_fp16: bool = False,
         model_seqlen: Optional[int] = None,
         block_name_to_quantize: Optional[str] = None,
         module_name_preceding_first_block: Optional[List[str]] = None,
         batch_size: int = 1,
         pad_token_id: Optional[int] = None,
-        use_triton = False,
+        disable_exllama: bool = False,
+        exllama_config: Dict[str, Any] = None,
         max_input_length: Optional[int] = None,
         cache_block_outputs: Optional[bool] = True,
         modules_in_block_to_quantize: Optional[List[List[str]]] = None,
@@ -153,6 +166,8 @@ def __init__(
                 Whether to perform sequential quantization even within a single Transformer block.
                 Instead of quantizing the entire block at once, we perform layer-wise quantization.
                 As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers.
+            use_cuda_fp16 (`bool`, defaults to `False`):
+                Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
             model_seqlen (`Optional[int]`, defaults to `None`):
                 The maximum sequence length that the model can take.
             block_name_to_quantize (`Optional[str]`, defaults to `None`):
@@ -163,6 +178,8 @@ def __init__(
                 The batch size of the dataset
             pad_token_id (`Optional[int]`, defaults to `None`):
                 The pad token id. Needed to prepare the dataset when `batch_size` > 1.
+            exllama_config (`Dict[str, Any]`, *optional*):
+                The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
             max_input_length (`Optional[int]`, defaults to `None`):
                 The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
                 It is specific to the exllama backend with act-order.
@@ -178,6 +195,9 @@ def __init__(
         if not OPTIMUM_AVAILABLE:
             raise ValueError(OPTIMUM_INSTALL_HINT)

+        if disable_exllama:
+            logger.warning("gptqmodel does not support parameter: disable_exllama=True. Setting `disable_exllama=False.")
+
         self.bits = bits
         self.dataset = dataset
         self.group_size = group_size
@@ -186,12 +206,13 @@ def __init__(
         self.desc_act = desc_act
         self.sym = sym
         self.true_sequential = true_sequential
+        self.use_cuda_fp16 = use_cuda_fp16
         self.model_seqlen = model_seqlen
         self.block_name_to_quantize = block_name_to_quantize
         self.module_name_preceding_first_block = module_name_preceding_first_block
         self.batch_size = batch_size
         self.pad_token_id = pad_token_id
-        self.use_triton = use_triton
+        self.exllama_config = exllama_config
         self.max_input_length = max_input_length
         self.quant_method = QuantizationMethod.GPTQ
         self.cache_block_outputs = cache_block_outputs
@@ -230,6 +251,18 @@ def __init__(
         if self.damp_auto_increment < 0:
             raise ValueError("damp_auto_increment must greater than 0.")

+        if self.exllama_config is None:
+            self.exllama_config = {"version": ExllamaVersion.TWO}
+        else:
+            if "version" not in self.exllama_config:
+                raise ValueError("`exllama_config` needs to have a `version` key")
+            elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
+                version = self.exllama_config["version"]
+                raise ValueError(
+                    f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
+                )
+        self.exllama_version = self.exllama_config["version"]
+
     def to_dict(self):
         """
         Returns the args in dict format.
```
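The restored quantizer selects the kernel through `exllama_config` instead of the removed `use_triton` flag. A hedged construction sketch, assuming the keyword arguments visible in the `__init__` hunks above; the dataset and group size values are placeholders:

```python
# Illustrative sketch, not part of this commit: choose the exllama kernel
# version when building the quantizer. If exllama_config is omitted, it
# defaults to {"version": ExllamaVersion.TWO}, per the validation block above.
from gptqmodel.integration.optimum.quantizer import ExllamaVersion, GPTQModelQuantizer

quantizer = GPTQModelQuantizer(
    bits=4,
    dataset="c4",           # placeholder calibration dataset
    group_size=128,
    desc_act=True,
    exllama_config={"version": ExllamaVersion.ONE},
    max_input_length=4096,  # per the docstring, specific to the exllama backend with act-order
)
```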
```diff
@@ -419,6 +452,9 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
         else:
             has_device_map = False

+        if hasattr(model, "dtype"):
+            self.use_cuda_fp16 = model.dtype == torch.float16
+
         if self.model_seqlen is None:
             # We allow a max value of 4028 to avoid passing data with huge length to the model during the calibration step
             self.model_seqlen = min(4028, get_seqlen(model))
@@ -607,7 +643,15 @@ def tmp(_, input, output):
             torch.cuda.empty_cache()

         if self.bits == 4:
-            self.use_triton = True
+            # device not on gpu
+            if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])):
+                raise ValueError("Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU.")
+            elif self.exllama_version == ExllamaVersion.TWO:
+                logger.warning(
+                    "Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights."
+                    "Setting `exllama_version=ExllamaVersion.ONE`. You should only use Exllamav2 backend for inference. "
+                )
+                self.exllama_version = ExllamaVersion.ONE
         # Step 4: Pack the model at the end (Replacing the layers)
         self.pack_model(model=model, quantizers=quantizers)

@@ -645,6 +689,12 @@ class StoreAttr(object):
         model.quantize_config = StoreAttr()
         model.quantize_config.desc_act = self.desc_act
         model = gptqmodel_post_init(model, use_act_order=self.desc_act)
+        if (
+            self.desc_act
+            and self.exllama_version == ExllamaVersion.ONE
+            and self.max_input_length is not None
+        ):
+            model = exllama_set_max_input_length(model, self.max_input_length)
         return model

     def pack_model(
@@ -680,10 +730,13 @@ def pack_model(
         logger.info("Model packed.")

     def select_quantlinear(self):
-        if self.use_triton:
-            backend = BACKEND.TRITON
-        else:
+        if self.exllama_version == ExllamaVersion.ONE:
+            backend = BACKEND.EXLLAMA
+        elif self.exllama_version == ExllamaVersion.TWO:
             backend = BACKEND.EXLLAMA_V2
+        else:
+            backend = BACKEND.AUTO
+
         QuantLinear = select_quant_linear(
             sym=self.sym,
             desc_act=self.desc_act,
@@ -735,6 +788,7 @@ def load_quantized_model(
     offload_folder: Optional[str] = None,
     offload_buffers: Optional[str] = None,
     offload_state_dict: bool = False,
+    exllama_config: Optional[Dict[str, Any]] = None,
     max_input_length: Optional[int] = None,
 ):
     """
@@ -768,6 +822,8 @@ def load_quantized_model(
             If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
             the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
             picked contains `"disk"` values.
+        exllama_config (`Optional[Dict[str, Any]]`, defaults to `None`):
+            The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
         max_input_length (`Optional[int]`, defaults to `None`):
             The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
             It is specific to the exllama backend with act-order.
@@ -790,6 +846,17 @@ def load_quantized_model(
         device_map = {"": torch.cuda.current_device()}
         logger.info("The device_map was not initialized." "Setting device_map to `{'':torch.cuda.current_device()}`.")

+    if exllama_config is None:
+        exllama_config = {"version": ExllamaVersion.TWO}
+    else:
+        if "version" not in exllama_config:
+            raise ValueError("`exllama_config` needs to have a `version` key")
+        elif exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
+            version = exllama_config["version"]
+            raise ValueError(
+                f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
+            )
+
     # this branch will check if model is from huggingface
     try:
         if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
@@ -802,6 +869,8 @@
                 f"Failed to load quantization config from {save_folder} (lookup for traceback): {err}\nTip: If the save directory is saved from a transformers.PreTrainedModel, make sure that `config.json` contains a 'quantization_config' key."
             ) from err
     quantizer = GPTQModelQuantizer.from_dict(quantize_config_dict)
+    quantizer.exllama_config = exllama_config
+    quantizer.exllama_version = quantizer.exllama_config["version"]
     quantizer.max_input_length = max_input_length

     model = quantizer.convert_model(model)
```
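`load_quantized_model` now also accepts `exllama_config`. A hedged loading sketch, assuming the optimum-style leading arguments (an empty-weights model plus `save_folder` and `device_map`); the checkpoint path is a placeholder:

```python
# Illustrative sketch, not part of this commit: pass an explicit exllama version
# when reloading a quantized checkpoint. exllama_config defaults to
# {"version": ExllamaVersion.TWO} when left as None, per the block above.
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

from gptqmodel.integration.optimum.quantizer import ExllamaVersion, load_quantized_model

config = AutoConfig.from_pretrained("path/to/quantized-checkpoint")  # placeholder path
with init_empty_weights():
    empty_model = AutoModelForCausalLM.from_config(config)

model = load_quantized_model(
    empty_model,
    save_folder="path/to/quantized-checkpoint",
    device_map="auto",
    exllama_config={"version": ExllamaVersion.ONE},
    max_input_length=4096,  # only used by the exllama v1 backend with act-order
)
```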

gptqmodel/integration/optimum/utils.py

Lines changed: 38 additions & 9 deletions

```diff
@@ -19,8 +19,8 @@
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import Conv1D

-from ...utils.logger import setup_logger
 from .constants import BLOCK_PATTERNS, SEQLEN_KEYS_TRANFORMERS
+from ...utils.logger import setup_logger

 ori_save_pretrained = PreTrainedModel.save_pretrained

@@ -126,6 +126,7 @@ def post_init(self):
     """
     import importlib

+    from transformers.utils.quantization_config import ExllamaVersion
     from packaging import version
     print("monkey patch postin")
     if self.bits not in [2, 3, 4, 8]:
@@ -152,19 +153,47 @@ def post_init(self):
                 ['wikitext2','c4','c4-new'], but we found {self.dataset}"""
             )

-    if self.use_exllama is None:
+    if self.disable_exllama is None and self.use_exllama is None:
         # New default behaviour
         self.use_exllama = True
+    elif self.disable_exllama is not None and self.use_exllama is None:
+        # Follow pattern of old config
+        logger.warning(
+            "Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`."
+            "The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file."
+        )
+        self.use_exllama = not self.disable_exllama
+        self.disable_exllama = None
+    elif self.disable_exllama is not None and self.use_exllama is not None:
+        # Only happens if user explicitly passes in both arguments
+        raise ValueError("Cannot specify both `disable_exllama` and `use_exllama`. Please use just `use_exllama`")
+
+    if self.exllama_config is None:
+        self.exllama_config = {"version": ExllamaVersion.ONE}
+    else:
+        if "version" not in self.exllama_config:
+            raise ValueError("`exllama_config` needs to have a `version` key.")
+        elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
+            exllama_version = self.exllama_config["version"]
+            raise ValueError(
+                f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {exllama_version}"
+            )

     if self.bits == 4 and self.use_exllama:
-        optimum_version = version.parse(importlib.metadata.version("optimum"))
-        # autogptq_version = version.parse(importlib.metadata.version("auto_gptq"))
-        # if optimum_version <= version.parse("1.13.2") or autogptq_version <= version.parse("0.4.2"):
-        if optimum_version <= version.parse("1.13.2"):
-            raise ValueError(
-                # f"You need optimum > 1.13.2 and auto-gptq > 0.4.2 . Make sure to have that version installed - detected version : optimum {optimum_version} and autogptq {autogptq_version}"
-                f"You need optimum > 1.13.2 . Make sure to have that version installed - detected version : optimum {optimum_version}"
+        if self.exllama_config["version"] == ExllamaVersion.ONE:
+            logger.info(
+                "You have activated exllama backend. Note that you can get better inference "
+                "speed using exllamav2 kernel by setting `exllama_config`."
             )
+        elif self.exllama_config["version"] == ExllamaVersion.TWO:
+            optimum_version = version.parse(importlib.metadata.version("optimum"))
+            # autogptq_version = version.parse(importlib.metadata.version("auto_gptq"))
+            # if optimum_version <= version.parse("1.13.2") or autogptq_version <= version.parse("0.4.2"):
+            if optimum_version <= version.parse("1.13.2"):
+                raise ValueError(
+                    # f"You need optimum > 1.13.2 and auto-gptq > 0.4.2 . Make sure to have that version installed - detected version : optimum {optimum_version} and autogptq {autogptq_version}"
+                    f"You need optimum > 1.13.2 . Make sure to have that version installed - detected version : optimum {optimum_version}"
+                )
     if self.modules_in_block_to_quantize is not None:
         optimum_version = version.parse(importlib.metadata.version("optimum"))
         if optimum_version < version.parse("1.15.0"):
```
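The patched `post_init` keeps the transformers deprecation path for `disable_exllama`. A hedged sketch of the two config styles it handles, assuming the stock `transformers.GPTQConfig` constructor that this hook monkey-patches:

```python
# Illustrative sketch, not part of this commit (assumes transformers' GPTQConfig API).
from transformers import GPTQConfig

# Deprecated style: post_init maps it to use_exllama = not disable_exllama
# and emits the deprecation warning shown in the diff above.
legacy = GPTQConfig(bits=4, disable_exllama=True)

# Current style: enable the kernel and pick the version via exllama_config.
current = GPTQConfig(bits=4, use_exllama=True, exllama_config={"version": 1})

# Passing both disable_exllama and use_exllama raises a ValueError, as in the diff.
```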
