import unittest
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

class TestTransformersIntegration(unittest.TestCase):
    """Integration tests for loading and quantizing GPTQ checkpoints through the Transformers API."""

    @classmethod
    def setUpClass(cls):
        pass

    def _test_load_quantized_model_gptq_v1(self, device_map):
        # Load a pre-quantized GPTQ (v1 format) checkpoint and compare greedy generation
        # against a known-good prefix of the expected output.
        model_id_or_path = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path)
        quantized_model = AutoModelForCausalLM.from_pretrained(model_id_or_path,
                                                               device_map=device_map)
        generate_str = tokenizer.decode(quantized_model.generate(**tokenizer("The capital of France is is", return_tensors="pt").to(quantized_model.device))[0])
        expect_str = "<s> The capital of France is is Paris.\n The capital of France is Paris.\n The capital of France is Paris.\n The capital of France is Paris.\n The capital of France is"
        self.assertEqual(generate_str[:50], expect_str[:50])

    def _test_load_quantized_model_gptq_v2(self, device_map):
        # Load a locally quantized GPTQ v2 (opt-125m) checkpoint and compare greedy generation
        # against the expected output.
        model_id_or_path = "/monster/data/model/opt-125m/quant/2024-12-02_13-28-10_subcircularly_autogptq_version_pr640_bit4_group128_seq2048_batch16/damp0.1_descTrue_gptq_v2_symTrue_pack_dataFalse_mseTrue_mse_norm2.4_mse_grid100_mse_maxshrink0.8/c40_gr0_dic0_sen0_det0_rate0_native0_lm_compression1024_text_reduction0/opt_125m_gptqv2"
        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path)
        quantized_model = AutoModelForCausalLM.from_pretrained(model_id_or_path,
                                                               device_map=device_map)
        generate_str = tokenizer.decode(quantized_model.generate(**tokenizer("The capital of France is is", return_tensors="pt").to(quantized_model.device))[0])
        expect_str = "</s>The capital of France is is found velvetJustice ten for bowel Tuesday"

        self.assertEqual(generate_str[:len(expect_str)], expect_str)

    def _test_quantize(self, device_map):
        # Quantize facebook/opt-125m to 4-bit GPTQ with a one-sample calibration dataset,
        # save the result, reload it, and compare greedy generation against the expected output.
        model_id = "facebook/opt-125m"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        dataset = [
            "gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
        gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
        quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device_map,
                                                               quantization_config=gptq_config)
        quantized_model.save_pretrained("./opt-125m-gptq")
        tokenizer.save_pretrained("./opt-125m-gptq")

        # Reload the saved quantized checkpoint and generate from it.
        model = AutoModelForCausalLM.from_pretrained("./opt-125m-gptq", device_map=device_map)

        generate_str = tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0])

        expect_str = "</s>gptqmodel is a good way to get a good way for a good way for a good way."

        print("generate_str", generate_str)
        print("expect_str", expect_str)

        self.assertEqual(generate_str[:40], expect_str[:40])

    # The *_ipex variants run the checks on CPU (device_map="cpu"); the *_cuda variants on GPU.
    def test_load_quantized_model_gptq_v1_ipex(self):
        self._test_load_quantized_model_gptq_v1(device_map="cpu")

    def test_load_quantized_model_gptq_v1_cuda(self):
        self._test_load_quantized_model_gptq_v1(device_map="cuda")

    def test_load_quantized_model_gptq_v2_ipex(self):
        self._test_load_quantized_model_gptq_v2(device_map="cpu")

    def test_load_quantized_model_gptq_v2_cuda(self):
        self._test_load_quantized_model_gptq_v2(device_map="cuda")

    def test_quantize_ipex(self):
        self._test_quantize(device_map="cpu")

    def test_quantize_cuda(self):
        self._test_quantize(device_map="cuda")
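
# Not part of the original file: a standard unittest entry point, added only as a convenience
# sketch so the tests can also be run directly with the Python interpreter instead of a runner.
if __name__ == "__main__":
    unittest.main()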