
Commit 2587e1a

[FIX] gptq v2 load (#724)
* transformers
  Signed-off-by: jiqing-feng <[email protected]>
* add hf_select_quant_linear
  Signed-off-by: jiqing-feng <[email protected]>
* add transformers inference example
  Signed-off-by: jiqing-feng <[email protected]>
* add unittest

---------

Signed-off-by: jiqing-feng <[email protected]>
Co-authored-by: jiqing-feng <[email protected]>
1 parent d8a802e commit 2587e1a

File tree

5 files changed: +133 -17 lines changed

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
+quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
+print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))
+
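The committed inference example is intentionally minimal. A slightly more explicit variant is sketched below; the device_map="auto" placement and the max_new_tokens cap are illustrative assumptions, not part of the commit.

# Hedged variant of the committed example: device_map and max_new_tokens are
# illustrative assumptions, not settings used by the commit.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_id)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

inputs = tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device)
output_ids = quantized_model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0]))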
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
+
+model_id = "facebook/opt-125m"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
+gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", quantization_config=gptq_config)
+quantized_model.save_pretrained("./opt-125m-gptq")
+tokenizer.save_pretrained("./opt-125m-gptq")
+
+model = AutoModelForCausalLM.from_pretrained("./opt-125m-gptq", device_map="auto")
+
+print(tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0]))
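The quantization example above only sets bits, dataset, and tokenizer. For readers tuning the quantization, a hedged variant with explicit group_size and desc_act values is sketched below; those two values are illustrative assumptions, not settings used by the commit.

# Hedged sketch: group_size=128 and desc_act=False are illustrative assumptions;
# the committed example leaves these GPTQConfig options at their defaults.
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]

gptq_config = GPTQConfig(bits=4, group_size=128, desc_act=False, dataset=dataset, tokenizer=tokenizer)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", quantization_config=gptq_config)
quantized_model.save_pretrained("./opt-125m-gptq")
tokenizer.save_pretrained("./opt-125m-gptq")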

gptqmodel/utils/importer.py

Lines changed: 42 additions & 17 deletions
@@ -34,38 +34,63 @@
 }
 
 
+def hf_select_quant_linear(
+    bits: int,
+    group_size: int,
+    desc_act: bool,
+    sym: bool,
+    backend: BACKEND = BACKEND.AUTO,
+    format: FORMAT = FORMAT.GPTQ,
+    pack: bool = False,
+    dynamic=None,
+):
+    return select_quant_linear(
+        bits=bits,
+        group_size=group_size,
+        desc_act=desc_act,
+        sym=sym,
+        backend=backend,
+        format=format,
+        pack=pack,
+        dynamic=dynamic,
+    )
+
+
 # auto select the correct/optimal QuantLinear class
 def select_quant_linear(
     bits: int,
     group_size: int,
     desc_act: bool,
     sym: bool,
-    backend: BACKEND,
-    format: FORMAT,
+    backend: BACKEND = BACKEND.AUTO,
+    format: FORMAT = FORMAT.GPTQ,
     pack: bool = False,
     dynamic=None,
 ):
     # Handle the case where backend is AUTO.
     if backend == BACKEND.AUTO:
-        allow_backends = format_dict[format]
-        err = None
-        for k, values in backend_dict.items():
+        if not torch.cuda.is_available():
+            backend = BACKEND.IPEX
+        else:
+            allow_backends = format_dict[format]
+            err = None
+            for k, values in backend_dict.items():
 
-            for v in values:
-                in_allow_backends = k in allow_backends
-                validate, err = v.validate(bits, group_size, desc_act, sym, dynamic=dynamic)
-                if in_allow_backends and validate:
-                    if pack:
-                        check_pack_func = hasattr(v, "pack")
-                        if check_pack_func:
+                for v in values:
+                    in_allow_backends = k in allow_backends
+                    validate, err = v.validate(bits, group_size, desc_act, sym, dynamic=dynamic)
+                    if in_allow_backends and validate:
+                        if pack:
+                            check_pack_func = hasattr(v, "pack")
+                            if check_pack_func:
+                                logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
+                                return v
+                        else:
                             logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
                             return v
-                    else:
-                        logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
-                        return v
 
-        if err:
-            raise err
+            if err:
+                raise err
 
     # Handle the case where backend is not AUTO.
     if backend == BACKEND.TRITON:
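For reference, a minimal sketch of calling the new hf_select_quant_linear wrapper directly is shown below. The import paths simply mirror the file paths in this diff, and the bits/group_size/desc_act/sym values are illustrative assumptions; with the default BACKEND.AUTO, the patched selection logic resolves to the IPEX kernel on a host without CUDA.

# Minimal sketch (parameter values are assumptions): with the default
# BACKEND.AUTO, the patched selection falls back to IPEX when CUDA is absent.
from gptqmodel.utils.backend import BACKEND
from gptqmodel.utils.importer import hf_select_quant_linear

QuantLinear = hf_select_quant_linear(
    bits=4,
    group_size=128,
    desc_act=False,
    sym=True,
    backend=BACKEND.AUTO,
)
print(QuantLinear)  # the QuantLinear class selected for this configuration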

gptqmodel/utils/model.py

Lines changed: 4 additions & 0 deletions
@@ -21,6 +21,7 @@
 from transformers.utils.hub import cached_file
 
 from .backend import BACKEND
+from .exllama import exllama_set_max_input_length
 from .importer import select_quant_linear
 from .logger import setup_logger
 from .progress import ProgressBar
@@ -536,6 +537,9 @@ def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeCon
 
     torch.cuda.empty_cache()
 
+    # if use_act_order and max_input_length and isinstance(submodule, ExllamaQuantLinear):
+    #     model = exllama_set_max_input_length(model, max_input_length)
+
     return model
 
 

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+import unittest
+from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
+
+
+class TestTransformersIntegration(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(self):
+        pass
+
+    def _test_load_quantized_model_gptq_v1(self, device_map):
+        model_id_or_path = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
+        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path)
+        quantized_model = AutoModelForCausalLM.from_pretrained(model_id_or_path,
+                                                               device_map=device_map,)
+        generate_str = tokenizer.decode(quantized_model.generate(**tokenizer("The capital of France is is", return_tensors="pt").to(quantized_model.device))[0])
+        expect_str = "<s> The capital of France is is Paris.\nThe capital of France is Paris.\nThe capital of France is Paris.\nThe capital of France is Paris.\nThe capital of France is"
+        self.assertEqual(generate_str[:50], expect_str[:50])
+
+    def _test_load_quantized_model_gptq_v2(self, device_map):
+        model_id_or_path = "/monster/data/model/opt-125m/quant/2024-12-02_13-28-10_subcircularly_autogptq_version_pr640_bit4_group128_seq2048_batch16/damp0.1_descTrue_gptq_v2_symTrue_pack_dataFalse_mseTrue_mse_norm2.4_mse_grid100_mse_maxshrink0.8/c40_gr0_dic0_sen0_det0_rate0_native0_lm_compression1024_text_reduction0/opt_125m_gptqv2"
+        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path)
+        quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+                                                               device_map=device_map,)
+        generate_str = tokenizer.decode(quantized_model.generate(**tokenizer("The capital of France is is", return_tensors="pt").to(quantized_model.device))[0])
+        expect_str = "</s>The capital of France is is found velvetJustice ten for bowel Tuesday"
+
+        self.assertEqual(generate_str[:len(expect_str)], expect_str)
+
+    def _test_quantize(self, device_map):
+        model_id = "facebook/opt-125m"
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        dataset = [
+            "gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
+        gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
+        quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device_map,
+                                                               quantization_config=gptq_config)
+        quantized_model.save_pretrained("./opt-125m-gptq")
+        tokenizer.save_pretrained("./opt-125m-gptq")
+
+        model = AutoModelForCausalLM.from_pretrained("./opt-125m-gptq", device_map=device_map)
+
+        generate_str = tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0])
+
+        expect_str = "</s>gptqmodel is a good way to get a good way for a good way for a good way."
+
+        print('generate_str',generate_str)
+        print('expect_str',expect_str)
+
+        self.assertEqual(generate_str[:40], expect_str[:40])
+
+    def test_load_quantized_model_gptq_v1_ipex(self):
+        self._test_load_quantized_model_gptq_v1(device_map="cpu")
+
+    def test_load_quantized_model_gptq_v1_cuda(self):
+        self._test_load_quantized_model_gptq_v1(device_map="cuda")
+
+    def test_load_quantized_model_gptq_v2_ipex(self):
+        self._test_load_quantized_model_gptq_v2(device_map="cpu")
+
+    def test_load_quantized_model_gptq_v2_cuda(self):
+        self._test_load_quantized_model_gptq_v2(device_map="cuda")
+
+    def test_quantize_ipex(self):
+        self._test_quantize(device_map="cpu")
+
+    def test_quantize_cuda(self):
+        self._test_quantize(device_map="cuda")
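The new tests follow the standard unittest layout, so they can be driven by the usual loader. A hypothetical runner is sketched below; the module name test_transformers is an assumption, since the committed file's path is not visible in this view.

# Hypothetical runner: the module name "test_transformers" is an assumption;
# this diff view does not show the actual file name of the new test module.
import unittest

suite = unittest.defaultTestLoader.loadTestsFromName(
    "test_transformers.TestTransformersIntegration"
)
unittest.TextTestRunner(verbosity=2).run(suite)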
