from ...quantization import FORMAT, FORMAT_FIELD_JSON, GPTQ, QuantizeConfig
from ...utils.backend import BACKEND
+ from ...utils.exllama import exllama_set_max_input_length
from ...utils.importer import select_quant_linear
from ...utils.logger import setup_logger
from ...utils.model import convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, gptqmodel_post_init
@@ -67,13 +68,15 @@ def __init__(
        desc_act: bool = False,
        sym: bool = True,
        true_sequential: bool = True,
+         use_cuda_fp16: bool = False,
        model_seqlen: Optional[int] = None,
        block_name_to_quantize: Optional[str] = None,
        module_name_preceding_first_block: Optional[List[str]] = None,
        batch_size: int = 1,
        pad_token_id: Optional[int] = None,
        use_exllama: Optional[bool] = None,
        max_input_length: Optional[int] = None,
+         exllama_config: Optional[Dict[str, Any]] = None,
        cache_block_outputs: bool = True,
        modules_in_block_to_quantize: Optional[List[List[str]]] = None,
        **kwargs,
@@ -88,13 +91,16 @@ def __init__(
        self.desc_act = desc_act
        self.sym = sym
        self.true_sequential = true_sequential
+         self.use_cuda_fp16 = use_cuda_fp16
        self.model_seqlen = model_seqlen
        self.block_name_to_quantize = block_name_to_quantize
        self.module_name_preceding_first_block = module_name_preceding_first_block
        self.batch_size = batch_size
        self.pad_token_id = pad_token_id
        self.use_exllama = use_exllama
        self.max_input_length = max_input_length
+         self.exllama_config = exllama_config
+         self.disable_exllama = kwargs.pop("disable_exllama", None)
        self.cache_block_outputs = cache_block_outputs
        self.modules_in_block_to_quantize = modules_in_block_to_quantize
        self.post_init()
@@ -104,6 +110,11 @@ def post_init(self):
            raise ValueError(OPTIMUM_INSTALL_HINT)
        super().post_init()

+ class ExllamaVersion(int, Enum):
+     ONE = 1
+     TWO = 2
+
+
class GPTQModelQuantizer(object):
    r"""
    A simple API for GPTQ Quantization
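
Since `ExllamaVersion` subclasses both `int` and `Enum`, its members compare equal to plain integers; that is why a user-supplied `{"version": 2}` and the internal default `{"version": ExllamaVersion.TWO}` behave the same in the checks added further down. A minimal sketch of that equivalence (standalone; only the enum definition is taken from this diff):

    from enum import Enum

    class ExllamaVersion(int, Enum):
        ONE = 1
        TWO = 2

    assert ExllamaVersion.TWO == 2                        # int-style comparison holds
    assert 2 in [ExllamaVersion.ONE, ExllamaVersion.TWO]  # so plain ints pass the membership check
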
@@ -119,12 +130,14 @@ def __init__(
        desc_act: bool = False,
        sym: bool = True,
        true_sequential: bool = True,
+         use_cuda_fp16: bool = False,
        model_seqlen: Optional[int] = None,
        block_name_to_quantize: Optional[str] = None,
        module_name_preceding_first_block: Optional[List[str]] = None,
        batch_size: int = 1,
        pad_token_id: Optional[int] = None,
-         use_triton=False,
+         disable_exllama: bool = False,
+         exllama_config: Dict[str, Any] = None,
        max_input_length: Optional[int] = None,
        cache_block_outputs: Optional[bool] = True,
        modules_in_block_to_quantize: Optional[List[List[str]]] = None,
@@ -153,6 +166,8 @@ def __init__(
                Whether to perform sequential quantization even within a single Transformer block.
                Instead of quantizing the entire block at once, we perform layer-wise quantization.
                As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers.
+             use_cuda_fp16 (`bool`, defaults to `False`):
+                 Whether or not to use the optimized CUDA kernel for fp16 models. Requires the model to be in fp16.
            model_seqlen (`Optional[int]`, defaults to `None`):
                The maximum sequence length that the model can take.
            block_name_to_quantize (`Optional[str]`, defaults to `None`):
@@ -163,6 +178,8 @@ def __init__(
                The batch size of the dataset
            pad_token_id (`Optional[int]`, defaults to `None`):
                The pad token id. Needed to prepare the dataset when `batch_size` > 1.
+             exllama_config (`Dict[str, Any]`, *optional*):
+                 The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
            max_input_length (`Optional[int]`, defaults to `None`):
                The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
                It is specific to the exllama backend with act-order.
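
Taken together, the new constructor arguments can be passed as in this hedged sketch; the argument values are illustrative, only parameters visible in this diff are assumed, and the import path for `GPTQModelQuantizer` and `ExllamaVersion` depends on where this module lives:

    quantizer = GPTQModelQuantizer(
        bits=4,
        dataset="c4",                                    # calibration dataset, illustrative value
        group_size=128,
        desc_act=True,
        use_cuda_fp16=True,                              # only meaningful for fp16 models
        exllama_config={"version": ExllamaVersion.ONE},  # or simply {"version": 1}
        max_input_length=8192,                           # exllama v1 + act-order buffer size
    )
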
@@ -178,6 +195,9 @@ def __init__(
        if not OPTIMUM_AVAILABLE:
            raise ValueError(OPTIMUM_INSTALL_HINT)

+         if disable_exllama:
+             logger.warning("gptqmodel does not support the parameter `disable_exllama=True`. Setting `disable_exllama=False`.")
+
        self.bits = bits
        self.dataset = dataset
        self.group_size = group_size
@@ -186,12 +206,13 @@ def __init__(
        self.desc_act = desc_act
        self.sym = sym
        self.true_sequential = true_sequential
+         self.use_cuda_fp16 = use_cuda_fp16
        self.model_seqlen = model_seqlen
        self.block_name_to_quantize = block_name_to_quantize
        self.module_name_preceding_first_block = module_name_preceding_first_block
        self.batch_size = batch_size
        self.pad_token_id = pad_token_id
-         self.use_triton = use_triton
+         self.exllama_config = exllama_config
        self.max_input_length = max_input_length
        self.quant_method = QuantizationMethod.GPTQ
        self.cache_block_outputs = cache_block_outputs
@@ -230,6 +251,18 @@ def __init__(
        if self.damp_auto_increment < 0:
            raise ValueError("damp_auto_increment must be greater than 0.")

+         if self.exllama_config is None:
+             self.exllama_config = {"version": ExllamaVersion.TWO}
+         else:
+             if "version" not in self.exllama_config:
+                 raise ValueError("`exllama_config` needs to have a `version` key")
+             elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
+                 version = self.exllama_config["version"]
+                 raise ValueError(
+                     f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
+                 )
+         self.exllama_version = self.exllama_config["version"]
+
    def to_dict(self):
        """
        Returns the args in dict format.
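
The observable behaviour of this validation, assuming optimum is installed and the remaining constructor defaults apply, is sketched below:

    q = GPTQModelQuantizer(bits=4, dataset="c4")
    assert q.exllama_config == {"version": ExllamaVersion.TWO}  # default when unset
    assert q.exllama_version == ExllamaVersion.TWO

    try:
        GPTQModelQuantizer(bits=4, dataset="c4", exllama_config={"kernel": 2})  # no "version" key
    except ValueError:
        pass
    try:
        GPTQModelQuantizer(bits=4, dataset="c4", exllama_config={"version": 3})  # unknown version
    except ValueError:
        pass
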
@@ -419,6 +452,9 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
        else:
            has_device_map = False

+         if hasattr(model, "dtype"):
+             self.use_cuda_fp16 = model.dtype == torch.float16
+
        if self.model_seqlen is None:
            # We allow a max value of 4028 to avoid passing data with huge length to the model during the calibration step
            self.model_seqlen = min(4028, get_seqlen(model))
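
So whatever was passed at construction time, `use_cuda_fp16` is re-derived from the model's dtype when quantization starts; loading the model in half precision is enough to enable the fp16 path. A hedged sketch (the model id and tokenizer handling are illustrative):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
    model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype=torch.float16)

    quantizer.quantize_model(model, tokenizer=tokenizer)
    # model.dtype == torch.float16, so quantizer.use_cuda_fp16 is now True
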
@@ -607,7 +643,15 @@ def tmp(_, input, output):
        torch.cuda.empty_cache()

        if self.bits == 4:
-             self.use_triton = True
+             # device not on gpu
+             if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])):
+                 raise ValueError("Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU.")
+             elif self.exllama_version == ExllamaVersion.TWO:
+                 logger.warning(
+                     "Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights."
+                     "Setting `exllama_version=ExllamaVersion.ONE`. You should only use Exllamav2 backend for inference."
+                 )
+                 self.exllama_version = ExllamaVersion.ONE
        # Step 4: Pack the model at the end (Replacing the layers)
        self.pack_model(model=model, quantizers=quantizers)
@@ -645,6 +689,12 @@ class StoreAttr(object):
        model.quantize_config = StoreAttr()
        model.quantize_config.desc_act = self.desc_act
        model = gptqmodel_post_init(model, use_act_order=self.desc_act)
+         if (
+             self.desc_act
+             and self.exllama_version == ExllamaVersion.ONE
+             and self.max_input_length is not None
+         ):
+             model = exllama_set_max_input_length(model, self.max_input_length)
        return model

    def pack_model(
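
The same helper can also be applied on its own when a post-initialized exllama v1 model with act-order needs to accept longer prompts than its current buffer allows; a hedged sketch using only the call shape visible above, with an illustrative length:

    # Re-allocate the exllama v1 buffers for prompts of up to 8192 tokens.
    model = exllama_set_max_input_length(model, 8192)

This mirrors what the branch above does automatically when `max_input_length` is passed to the quantizer.
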
@@ -680,10 +730,13 @@ def pack_model(
        logger.info("Model packed.")

    def select_quantlinear(self):
-         if self.use_triton:
-             backend = BACKEND.TRITON
-         else:
+         if self.exllama_version == ExllamaVersion.ONE:
+             backend = BACKEND.EXLLAMA
+         elif self.exllama_version == ExllamaVersion.TWO:
            backend = BACKEND.EXLLAMA_V2
+         else:
+             backend = BACKEND.AUTO
+
        QuantLinear = select_quant_linear(
            sym=self.sym,
            desc_act=self.desc_act,
@@ -735,6 +788,7 @@ def load_quantized_model(
    offload_folder: Optional[str] = None,
    offload_buffers: Optional[str] = None,
    offload_state_dict: bool = False,
+     exllama_config: Optional[Dict[str, Any]] = None,
    max_input_length: Optional[int] = None,
):
    """
@@ -768,6 +822,8 @@ def load_quantized_model(
            If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
            the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
            picked contains `"disk"` values.
+         exllama_config (`Optional[Dict[str, Any]]`, defaults to `None`):
+             The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
        max_input_length (`Optional[int]`, defaults to `None`):
            The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
            It is specific to the exllama backend with act-order.
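
A hedged end-to-end sketch of how these two arguments are meant to be used together when reloading a quantized checkpoint; the paths are placeholders and the empty-weights setup is the usual accelerate pattern rather than something mandated by this diff:

    from accelerate import init_empty_weights
    from transformers import AutoConfig, AutoModelForCausalLM

    save_folder = "path/to/quantized-checkpoint"          # placeholder path
    config = AutoConfig.from_pretrained(save_folder)
    with init_empty_weights():
        empty_model = AutoModelForCausalLM.from_config(config)
    empty_model.tie_weights()

    model = load_quantized_model(
        empty_model,
        save_folder=save_folder,
        device_map="auto",
        exllama_config={"version": ExllamaVersion.ONE},   # pick the exllama v1 kernel
        max_input_length=8192,                            # enlarge the act-order buffer
    )
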
@@ -790,6 +846,17 @@ def load_quantized_model(
        device_map = {"": torch.cuda.current_device()}
        logger.info("The device_map was not initialized." "Setting device_map to `{'':torch.cuda.current_device()}`.")

+     if exllama_config is None:
+         exllama_config = {"version": ExllamaVersion.TWO}
+     else:
+         if "version" not in exllama_config:
+             raise ValueError("`exllama_config` needs to have a `version` key")
+         elif exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
+             version = exllama_config["version"]
+             raise ValueError(
+                 f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}"
+             )
+
    # this branch will check if model is from huggingface
    try:
        if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
@@ -802,6 +869,8 @@ def load_quantized_model(
            f"Failed to load quantization config from {save_folder} (lookup for traceback): {err}\nTip: If the save directory is saved from a transformers.PreTrainedModel, make sure that `config.json` contains a 'quantization_config' key."
        ) from err
    quantizer = GPTQModelQuantizer.from_dict(quantize_config_dict)
+     quantizer.exllama_config = exllama_config
+     quantizer.exllama_version = quantizer.exllama_config["version"]
    quantizer.max_input_length = max_input_length

    model = quantizer.convert_model(model)