[FIX] gptq v2 load #724

Merged
5 commits, merged Dec 2, 2024
6 changes: 6 additions & 0 deletions examples/inference/run_transformers.py
@@ -0,0 +1,6 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))
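
For reference, a slightly expanded sketch of the example above (not part of this PR's diff): it caps generation with max_new_tokens and lets transformers place the weights via device_map="auto"; both are standard transformers arguments, and the checkpoint is the same TinyLlama GPTQ model.

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

# Cap the number of generated tokens so the example terminates quickly.
inputs = tokenizer("gptqmodel is", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))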

13 changes: 13 additions & 0 deletions examples/quantization/transformers_usage.py
@@ -0,0 +1,13 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", quantization_config=gptq_config)
quantized_model.save_pretrained("./opt-125m-gptq")
tokenizer.save_pretrained("./opt-125m-gptq")

model = AutoModelForCausalLM.from_pretrained("./opt-125m-gptq", device_map="auto")

print(tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0]))
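
The same quantization flow, sketched with a few more GPTQConfig knobs spelled out (not part of this PR; group_size, desc_act and sym are documented GPTQConfig fields in transformers, and the values here are illustrative rather than prescribed by this change):

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
gptq_config = GPTQConfig(
    bits=4,
    group_size=128,   # per-group quantization granularity
    desc_act=False,   # activation-order quantization disabled
    sym=True,         # symmetric quantization
    dataset=["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."],
    tokenizer=tokenizer,
)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", quantization_config=gptq_config)
quantized_model.save_pretrained("./opt-125m-gptq")
tokenizer.save_pretrained("./opt-125m-gptq")
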
59 changes: 42 additions & 17 deletions gptqmodel/utils/importer.py
@@ -34,38 +34,63 @@
 }


+def hf_select_quant_linear(
+    bits: int,
+    group_size: int,
+    desc_act: bool,
+    sym: bool,
+    backend: BACKEND = BACKEND.AUTO,
+    format: FORMAT = FORMAT.GPTQ,
+    pack: bool = False,
+    dynamic=None,
+):
+    return select_quant_linear(
+        bits=bits,
+        group_size=group_size,
+        desc_act=desc_act,
+        sym=sym,
+        backend=backend,
+        format=format,
+        pack=pack,
+        dynamic=dynamic,
+    )
+
+
 # auto select the correct/optimal QuantLinear class
 def select_quant_linear(
     bits: int,
     group_size: int,
     desc_act: bool,
     sym: bool,
-    backend: BACKEND,
-    format: FORMAT,
+    backend: BACKEND = BACKEND.AUTO,
+    format: FORMAT = FORMAT.GPTQ,
     pack: bool = False,
     dynamic=None,
 ):
     # Handle the case where backend is AUTO.
     if backend == BACKEND.AUTO:
-        allow_backends = format_dict[format]
-        err = None
-        for k, values in backend_dict.items():
+        if not torch.cuda.is_available():
+            backend = BACKEND.IPEX
+        else:
+            allow_backends = format_dict[format]
+            err = None
+            for k, values in backend_dict.items():

-            for v in values:
-                in_allow_backends = k in allow_backends
-                validate, err = v.validate(bits, group_size, desc_act, sym, dynamic=dynamic)
-                if in_allow_backends and validate:
-                    if pack:
-                        check_pack_func = hasattr(v, "pack")
-                        if check_pack_func:
+                for v in values:
+                    in_allow_backends = k in allow_backends
+                    validate, err = v.validate(bits, group_size, desc_act, sym, dynamic=dynamic)
+                    if in_allow_backends and validate:
+                        if pack:
+                            check_pack_func = hasattr(v, "pack")
+                            if check_pack_func:
+                                logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
+                                return v
+                            else:
+                                logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
+                                return v
+                        else:
+                            logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
+                            return v

-        if err:
-            raise err
+            if err:
+                raise err

     # Handle the case where backend is not AUTO.
     if backend == BACKEND.TRITON:
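
A minimal sketch of calling the new hf_select_quant_linear entry point; the import path follows this diff's module layout, and the quantization settings are illustrative rather than mandated by the PR:

from gptqmodel.utils.importer import hf_select_quant_linear

# backend and format are left at their new defaults (AUTO / GPTQ). With AUTO,
# the selector now falls back to IPEX when CUDA is unavailable and otherwise
# picks the fastest kernel that validates against these settings.
QuantLinear = hf_select_quant_linear(bits=4, group_size=128, desc_act=False, sym=True)
print(QuantLinear)
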
4 changes: 4 additions & 0 deletions gptqmodel/utils/model.py
@@ -21,6 +21,7 @@
 from transformers.utils.hub import cached_file

 from .backend import BACKEND
+from .exllama import exllama_set_max_input_length
 from .importer import select_quant_linear
 from .logger import setup_logger
 from .progress import ProgressBar
@@ -536,6 +537,9 @@ def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeConfig

     torch.cuda.empty_cache()

+    # if use_act_order and max_input_length and isinstance(submodule, ExllamaQuantLinear):
+    # model = exllama_set_max_input_length(model, max_input_length)
+
     return model


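The commented-out hook above references exllama_set_max_input_length, which this diff now imports. A sketch of how such a hook could be applied if enabled; the helper name and the 8192 value are illustrative, and only the imported function comes from this diff:

from gptqmodel.utils.exllama import exllama_set_max_input_length


def extend_exllama_buffers(model, max_input_length: int = 8192):
    # Re-allocate exllama's fixed input buffers so act-order (desc_act) models can
    # accept prompts longer than the default limit. `model` is assumed to be a loaded
    # GPTQ model whose linear layers use the exllama kernel.
    return exllama_set_max_input_length(model, max_input_length)
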
68 changes: 68 additions & 0 deletions tests/test_transformers_integration.py
@@ -0,0 +1,68 @@
import unittest
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig


class TestTransformersIntegration(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        pass

    def _test_load_quantized_model_gptq_v1(self, device_map):
        model_id_or_path = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path)
        quantized_model = AutoModelForCausalLM.from_pretrained(model_id_or_path,
                                                               device_map=device_map,)
        generate_str = tokenizer.decode(quantized_model.generate(**tokenizer("The capital of France is is", return_tensors="pt").to(quantized_model.device))[0])
        expect_str = "<s> The capital of France is is Paris.\nThe capital of France is Paris.\nThe capital of France is Paris.\nThe capital of France is Paris.\nThe capital of France is"
        self.assertEqual(generate_str[:50], expect_str[:50])

    def _test_load_quantized_model_gptq_v2(self, device_map):
        model_id_or_path = "/monster/data/model/opt-125m/quant/2024-12-02_13-28-10_subcircularly_autogptq_version_pr640_bit4_group128_seq2048_batch16/damp0.1_descTrue_gptq_v2_symTrue_pack_dataFalse_mseTrue_mse_norm2.4_mse_grid100_mse_maxshrink0.8/c40_gr0_dic0_sen0_det0_rate0_native0_lm_compression1024_text_reduction0/opt_125m_gptqv2"
        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path)
        quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
                                                               device_map=device_map,)
        generate_str = tokenizer.decode(quantized_model.generate(**tokenizer("The capital of France is is", return_tensors="pt").to(quantized_model.device))[0])
        expect_str = "</s>The capital of France is is found velvetJustice ten for bowel Tuesday"

        self.assertEqual(generate_str[:len(expect_str)], expect_str)

    def _test_quantize(self, device_map):
        model_id = "facebook/opt-125m"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        dataset = [
            "gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
        gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
        quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device_map,
                                                               quantization_config=gptq_config)
        quantized_model.save_pretrained("./opt-125m-gptq")
        tokenizer.save_pretrained("./opt-125m-gptq")

        model = AutoModelForCausalLM.from_pretrained("./opt-125m-gptq", device_map=device_map)

        generate_str = tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0])

        expect_str = "</s>gptqmodel is a good way to get a good way for a good way for a good way."

        print('generate_str', generate_str)
        print('expect_str', expect_str)

        self.assertEqual(generate_str[:40], expect_str[:40])

    def test_load_quantized_model_gptq_v1_ipex(self):
        self._test_load_quantized_model_gptq_v1(device_map="cpu")

    def test_load_quantized_model_gptq_v1_cuda(self):
        self._test_load_quantized_model_gptq_v1(device_map="cuda")

    def test_load_quantized_model_gptq_v2_ipex(self):
        self._test_load_quantized_model_gptq_v2(device_map="cpu")

    def test_load_quantized_model_gptq_v2_cuda(self):
        self._test_load_quantized_model_gptq_v2(device_map="cuda")

    def test_quantize_ipex(self):
        self._test_quantize(device_map="cpu")

    def test_quantize_cuda(self):
        self._test_quantize(device_map="cuda")