diff --git a/examples/inference/run_transformers.py b/examples/inference/run_transformers.py
new file mode 100644
index 000000000..348515d3a
--- /dev/null
+++ b/examples/inference/run_transformers.py
@@ -0,0 +1,6 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
+quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
+print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))
+
diff --git a/examples/quantization/transformers_usage.py b/examples/quantization/transformers_usage.py
new file mode 100755
index 000000000..565c074a2
--- /dev/null
+++ b/examples/quantization/transformers_usage.py
@@ -0,0 +1,13 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
+
+model_id = "facebook/opt-125m"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
+gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", quantization_config=gptq_config)
+quantized_model.save_pretrained("./opt-125m-gptq")
+tokenizer.save_pretrained("./opt-125m-gptq")
+
+model = AutoModelForCausalLM.from_pretrained("./opt-125m-gptq", device_map="auto")
+
+print(tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0]))
\ No newline at end of file
diff --git a/gptqmodel/utils/importer.py b/gptqmodel/utils/importer.py
index 0f8ea85af..5c136503f 100644
--- a/gptqmodel/utils/importer.py
+++ b/gptqmodel/utils/importer.py
@@ -34,38 +34,63 @@
 }
 
 
+def hf_select_quant_linear(
+    bits: int,
+    group_size: int,
+    desc_act: bool,
+    sym: bool,
+    backend: BACKEND = BACKEND.AUTO,
+    format: FORMAT = FORMAT.GPTQ,
+    pack: bool = False,
+    dynamic=None,
+):
+    return select_quant_linear(
+        bits=bits,
+        group_size=group_size,
+        desc_act=desc_act,
+        sym=sym,
+        backend=backend,
+        format=format,
+        pack=pack,
+        dynamic=dynamic,
+    )
+
+
 # auto select the correct/optimal QuantLinear class
 def select_quant_linear(
     bits: int,
     group_size: int,
     desc_act: bool,
     sym: bool,
-    backend: BACKEND,
-    format: FORMAT,
+    backend: BACKEND = BACKEND.AUTO,
+    format: FORMAT = FORMAT.GPTQ,
     pack: bool = False,
     dynamic=None,
 ):
     # Handle the case where backend is AUTO.
     if backend == BACKEND.AUTO:
-        allow_backends = format_dict[format]
-        err = None
-        for k, values in backend_dict.items():
+        if not torch.cuda.is_available():
+            backend = BACKEND.IPEX
+        else:
+            allow_backends = format_dict[format]
+            err = None
+            for k, values in backend_dict.items():
-            for v in values:
-                in_allow_backends = k in allow_backends
-                validate, err = v.validate(bits, group_size, desc_act, sym, dynamic=dynamic)
-                if in_allow_backends and validate:
-                    if pack:
-                        check_pack_func = hasattr(v, "pack")
-                        if check_pack_func:
+                for v in values:
+                    in_allow_backends = k in allow_backends
+                    validate, err = v.validate(bits, group_size, desc_act, sym, dynamic=dynamic)
+                    if in_allow_backends and validate:
+                        if pack:
+                            check_pack_func = hasattr(v, "pack")
+                            if check_pack_func:
+                                logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
+                                return v
+                        else:
                             logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
                             return v
-                    else:
-                        logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
-                        return v
-        if err:
-            raise err
+            if err:
+                raise err
 
     # Handle the case where backend is not AUTO.
     if backend == BACKEND.TRITON:
diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py
index 3bea7eb13..c36a23135 100644
--- a/gptqmodel/utils/model.py
+++ b/gptqmodel/utils/model.py
@@ -21,6 +21,7 @@
 from transformers.utils.hub import cached_file
 
 from .backend import BACKEND
+from .exllama import exllama_set_max_input_length
 from .importer import select_quant_linear
 from .logger import setup_logger
 from .progress import ProgressBar
@@ -536,6 +537,9 @@ def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeCon
 
         torch.cuda.empty_cache()
 
+    # if use_act_order and max_input_length and isinstance(submodule, ExllamaQuantLinear):
+    #     model = exllama_set_max_input_length(model, max_input_length)
+
     return model
 
 
diff --git a/tests/test_transformers_integration.py b/tests/test_transformers_integration.py
new file mode 100644
index 000000000..7dd15bf96
--- /dev/null
+++ b/tests/test_transformers_integration.py
@@ -0,0 +1,68 @@
+import unittest
+from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
+
+
+class TestTransformersIntegration(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        pass
+
+    def _test_load_quantized_model_gptq_v1(self, device_map):
+        model_id_or_path = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
+        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path)
+        quantized_model = AutoModelForCausalLM.from_pretrained(model_id_or_path,
+                                                               device_map=device_map,)
+        generate_str = tokenizer.decode(quantized_model.generate(**tokenizer("The capital of France is is", return_tensors="pt").to(quantized_model.device))[0])
+        expect_str = " The capital of France is is Paris.\nThe capital of France is Paris.\nThe capital of France is Paris.\nThe capital of France is Paris.\nThe capital of France is"
+        self.assertEqual(generate_str[:50], expect_str[:50])
+
+    def _test_load_quantized_model_gptq_v2(self, device_map):
+        model_id_or_path = "/monster/data/model/opt-125m/quant/2024-12-02_13-28-10_subcircularly_autogptq_version_pr640_bit4_group128_seq2048_batch16/damp0.1_descTrue_gptq_v2_symTrue_pack_dataFalse_mseTrue_mse_norm2.4_mse_grid100_mse_maxshrink0.8/c40_gr0_dic0_sen0_det0_rate0_native0_lm_compression1024_text_reduction0/opt_125m_gptqv2"
+        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path)
+        quantized_model = AutoModelForCausalLM.from_pretrained(model_id_or_path,
+                                                               device_map=device_map,)
+        generate_str = tokenizer.decode(quantized_model.generate(**tokenizer("The capital of France is is", return_tensors="pt").to(quantized_model.device))[0])
+        expect_str = "The capital of France is is found velvetJustice ten for bowel Tuesday"
+
+        self.assertEqual(generate_str[:len(expect_str)], expect_str)
+
+    def _test_quantize(self, device_map):
+        model_id = "facebook/opt-125m"
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        dataset = [
+            "gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
+        gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
+        quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device_map,
+                                                               quantization_config=gptq_config)
+        quantized_model.save_pretrained("./opt-125m-gptq")
+        tokenizer.save_pretrained("./opt-125m-gptq")
+
+        model = AutoModelForCausalLM.from_pretrained("./opt-125m-gptq", device_map=device_map)
+
+        generate_str = tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0])
+
+        expect_str = "gptqmodel is a good way to get a good way for a good way for a good way."
+
+        print('generate_str', generate_str)
+        print('expect_str', expect_str)
+
+        self.assertEqual(generate_str[:40], expect_str[:40])
+
+    def test_load_quantized_model_gptq_v1_ipex(self):
+        self._test_load_quantized_model_gptq_v1(device_map="cpu")
+
+    def test_load_quantized_model_gptq_v1_cuda(self):
+        self._test_load_quantized_model_gptq_v1(device_map="cuda")
+
+    def test_load_quantized_model_gptq_v2_ipex(self):
+        self._test_load_quantized_model_gptq_v2(device_map="cpu")
+
+    def test_load_quantized_model_gptq_v2_cuda(self):
+        self._test_load_quantized_model_gptq_v2(device_map="cuda")
+
+    def test_quantize_ipex(self):
+        self._test_quantize(device_map="cpu")
+
+    def test_quantize_cuda(self):
+        self._test_quantize(device_map="cuda")
\ No newline at end of file
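
For reference (not part of the patch above): a minimal sketch of how a caller such as the Transformers GPTQ integration might use the new hf_select_quant_linear() entry point added to gptqmodel/utils/importer.py. The bits/group_size/desc_act/sym values below are illustrative inputs chosen for the example, not values taken from this diff.

# Illustrative sketch only: resolve a QuantLinear kernel class via the new HF-facing wrapper.
# The argument values here are example inputs, not defaults defined by this patch.
from gptqmodel.utils.importer import hf_select_quant_linear

QuantLinear = hf_select_quant_linear(
    bits=4,
    group_size=128,
    desc_act=False,
    sym=True,
    pack=False,  # pack=True would restrict the choice to kernels that implement pack()
)
# With backend left at BACKEND.AUTO, the patch falls back to the IPEX kernel on
# CPU-only hosts and otherwise picks the first validating backend for the format.
print(QuantLinear)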