Move save_quantized function into saver.py #467

Merged · 10 commits · Oct 25, 2024
283 changes: 27 additions & 256 deletions gptqmodel/models/base.py
@@ -25,6 +25,7 @@
from transformers.modeling_utils import no_init_weights, shard_checkpoint
from transformers.models.mllama.modeling_mllama import MllamaCrossAttentionDecoderLayer
from transformers.utils.generic import ContextManagers
from .writer import ModelWriter

from ..nn_modules.qlinear.qlinear_exllamav2 import ExllamaV2QuantLinear
from ..nn_modules.qlinear.qlinear_qbits import QBitsQuantLinear, qbits_dtype
@@ -627,264 +628,31 @@ def save_quantized(
max_shard_size: Optional[str] = None,
model_base_name: Optional[str] = None
):
"""save quantized model and configs to local disk"""
os.makedirs(save_dir, exist_ok=True)

pre_quantized_size_mb = get_model_files_size(self.model_name_or_path)
pre_quantized_size_gb = pre_quantized_size_mb / 1024
checkpoint_file_name = ""
if hasattr(self, "checkpoint_file_name") and self.checkpoint_file_name is not None:
checkpoint_file_name = self.checkpoint_file_name

# write gptqmodel tooling fingerprint to config
self.quantize_config.meta_set_versionable(
key=META_FIELD_QUANTIZER,
value=META_QUANTIZER_GPTQMODEL,
version=__version__,
)

self.quantize_config.meta_set(
key=META_FIELD_URI,
value=META_VALUE_URI,
)

self.quantize_config.meta_set(
key=META_FIELD_DAMP_PERCENT,
value=self.quantize_config.damp_percent
)

self.quantize_config.meta_set(
key=META_FIELD_DAMP_AUTO_INCREMENT,
value=self.quantize_config.damp_auto_increment
)

# The config, quantize_config and model may be edited in place in save_quantized.
config = copy.deepcopy(self.model.config)
quantize_config = copy.deepcopy(self.quantize_config)

if not self.quantized:
raise ValueError("Save aborted as model is not quantized. Please call `quantize()` first.")

if model_base_name is None:
model_base_name = (
f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g"
)

if quantize_config.format == FORMAT.GPTQ_V2:
logger.warning(
f"Using 'format = {FORMAT.GPTQ_V2}': the serialized model is only supported by GPTQModel version >= {MIN_VERSION_WITH_V2}."
)

if not self.load_quantized_model:
model = self.model
# # internal is always gptq v2 but allow users to pass gptq (v1) via config
if quantize_config.format == FORMAT.GPTQ:
# Model qzeros may be edited in place.
model = convert_gptq_v2_to_v1_format(
model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel
)
else:
model = self.get_model_with_quantize(quantize_config, self.model_name_or_path)
model.to(CPU)
state_dict = model.state_dict()

model_base_name = "model"

if use_safetensors:
state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}
model_save_name = model_base_name + ".safetensors"
else:
model_save_name = model_base_name + ".pt"

if not self.qlinear_kernel.SUPPORTS_SHARDS and max_shard_size is not None:
logger.warning("Sharding is not supported for this quant. Disabling sharding.")
max_shard_size = None

if max_shard_size is None:
if use_safetensors:
if safetensors_metadata is None:
safetensors_metadata = {}
elif not isinstance(safetensors_metadata, dict):
raise TypeError("safetensors_metadata must be a dictionary.")
else:
logger.debug(f"Received safetensors_metadata: {safetensors_metadata}")
new_safetensors_metadata = {}
converted_keys = False
for key, value in safetensors_metadata.items():
if not isinstance(key, str) or not isinstance(value, str):
converted_keys = True
try:
new_key = str(key)
new_value = str(value)
except Exception as e:
raise TypeError(
f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}"
)
if new_key in new_safetensors_metadata:
logger.warning(
f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting."
)
new_safetensors_metadata[new_key] = new_value
safetensors_metadata = new_safetensors_metadata
if converted_keys:
logger.debug(
f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}"
)

# Format is required to enable Accelerate to load the metadata
# otherwise it raises an OSError
safetensors_metadata["format"] = "pt"
safe_save(state_dict, join(save_dir, model_save_name), safetensors_metadata)
else:
logger.warning(
"We highly suggest saving quantized model using safetensors format for security reasons. Please set `use_safetensors=True` whenever possible.")
torch.save(model.state_dict(), join(save_dir, model_save_name))
total_size_mb = os.path.getsize(join(save_dir, model_save_name)) / (1024 * 1024)
else:
# Shard checkpoint
shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name=model_save_name)

# Clean the folder from a previous save
for filename in os.listdir(save_dir):
full_filename = join(save_dir, filename)

# make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005
filename_no_suffix = filename.replace(".bin", "").replace(".safetensors", "")
reg = re.compile(r"(.*?)-\d{5}-of-\d{5}")

if (
filename.startswith(model_base_name)
and isfile(full_filename)
and filename not in shards.keys()
and reg.fullmatch(filename_no_suffix) is not None
):
os.remove(full_filename)

total_size_mb = 0
# Save the model
for shard_file, shard in shards.items():
if use_safetensors:
if safetensors_metadata is None:
safetensors_metadata = {}
elif not isinstance(safetensors_metadata, dict):
raise TypeError("safetensors_metadata must be a dictionary.")
else:
logger.debug(f"Received safetensors_metadata: {safetensors_metadata}")
new_safetensors_metadata = {}
converted_keys = False
for key, value in safetensors_metadata.items():
if not isinstance(key, str) or not isinstance(value, str):
converted_keys = True
try:
new_key = str(key)
new_value = str(value)
except Exception as e:
raise TypeError(
f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}")
if new_key in new_safetensors_metadata:
logger.warning(
f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting.")
new_safetensors_metadata[new_key] = new_value
safetensors_metadata = new_safetensors_metadata
if converted_keys:
logger.debug(
f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}")

# Format is required to enable Accelerate to load the metadata
# otherwise it raises an OSError
safetensors_metadata["format"] = "pt"

safe_save(shard, join(save_dir, shard_file), safetensors_metadata)
else:
torch.save(shard, join(save_dir, shard_file))
shard_size_mb = os.path.getsize(join(save_dir, shard_file)) / (1024 * 1024)
total_size_mb += shard_size_mb

if index is not None:
index_save_name = model_save_name + ".index.json"
index_save_path = join(save_dir, index_save_name)
# Save the index as well
with open(index_save_path, "w", encoding="utf-8") as f:
content = json.dumps(index, indent=2, sort_keys=True) + "\n"
f.write(content)

total_size_gb = total_size_mb / 1024
size_diff_mb = pre_quantized_size_mb - total_size_mb
size_diff_gb = size_diff_mb / 1024
percent_diff = (size_diff_mb / pre_quantized_size_mb) * 100
logger.info(f"Pre-Quantized model size: {pre_quantized_size_mb:.2f}MB, {pre_quantized_size_gb:.2f}GB")
logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB")
logger.info(f"Size difference: {size_diff_mb:.2f}MB, {size_diff_gb:.2f}GB - {percent_diff:.2f}%")

config.quantization_config = quantize_config.to_dict()
config.save_pretrained(save_dir)

quantize_config.save_pretrained(save_dir)

# need to copy .py files for model/tokenizers not yet merged to HF transformers
if self.trust_remote_code:
copy_py_files(save_dir, model_id_or_path=self.model_name_or_path)

def get_model_with_quantize(self, quantize_config, model_name_or_path):
config = AutoConfig.from_pretrained(
model_name_or_path,
trust_remote_code=True,
)

def skip(*args, **kwargs):
pass

torch.nn.init.kaiming_uniform_ = skip
torch.nn.init.uniform_ = skip
torch.nn.init.normal_ = skip
transformers.modeling_utils._init_weights = False
init_contexts = [no_init_weights()]
with ContextManagers(init_contexts):
model = self.model_loader.from_config(
config, torch_dtype=torch.float16
)

if self.dynamic_expert_index is not None:
num_experts = getattr(config, self.dynamic_expert_index)
self.layer_modules = get_moe_layer_modules(layer_modules=self.layer_modules,
num_experts=num_experts)

layers = find_layers(model)
ignore_layers = [self.lm_head] + self.base_modules

for name in list(layers.keys()):
# allow loading of quantized lm_head
if quantize_config.lm_head and name == self.lm_head:
continue

if any(name.startswith(ignore_layer) for ignore_layer in ignore_layers) or all(
not name.endswith(ignore_layer) for sublist in self.layer_modules for ignore_layer in sublist
):
# log non-lm-head quantized layers only
if name is not self.lm_head:
logger.info(f"The layer {name} is not quantized.")
del layers[name]

make_quant(
model,
layers,
quantize_config.bits,
quantize_config.group_size,
backend=BACKEND.AUTO,
format=quantize_config.format,
desc_act=quantize_config.desc_act,
pack=True,
)
model.tie_weights()

accelerate.load_checkpoint_in_model(
model,
dtype=torch.float16,
# This is very hacky but works due to https://github.com/huggingface/accelerate/blob/bd72a5f1a80d5146554458823f8aeda0a9db5297/src/accelerate/utils/modeling.py#L292
checkpoint=self.checkpoint_file_name,
# device_map=device_map,
# offload_state_dict=True,
# offload_buffers=True,
ModelWriter.save_quantized(
self,
save_dir=save_dir,
use_safetensors=use_safetensors,
max_shard_size=max_shard_size,
quantized=self.quantized,
model_name_or_path=self.model_name_or_path,
model=self.model,
load_quantized_model=self.load_quantized_model,
qlinear_kernel=self.qlinear_kernel,
trust_remote_code=self.trust_remote_code,
safetensors_metadata=safetensors_metadata,
quantize_config=self.quantize_config,
dynamic_expert_index=self.dynamic_expert_index,
base_modules=self.base_modules,
lm_head=self.lm_head,
layer_modules=self.layer_modules,
checkpoint_file_name=checkpoint_file_name
)
torch.cuda.empty_cache()
return model


def save_pretrained(
self,
@@ -1040,7 +808,7 @@ def from_quantized(
**kwargs,
):

model, quantize_config, qlinear_kernel, load_quantized_model, generate = ModelLoader.from_quantized(
model, quantize_config, qlinear_kernel, load_quantized_model, generate, checkpoint_file_name = ModelLoader.from_quantized(
model_name_or_path=model_name_or_path,
device_map=device_map,
max_memory=max_memory,
@@ -1064,6 +832,9 @@
if generate is not None:
cls.generate = generate

if checkpoint_file_name is not None:
cls.checkpoint_file_name = checkpoint_file_name

return cls(
model,
quantized=True,
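Note: this refactor only relocates the implementation; the public `save_quantized()` entry point on the model wrapper is unchanged. A minimal usage sketch follows (the model id, calibration text, and output path are illustrative assumptions, not taken from this PR):

```python
# Usage sketch: quantize a model and save it. The save_quantized() call looks the
# same before and after this PR; internally it now delegates to ModelWriter.
from transformers import AutoTokenizer
from gptqmodel import GPTQModel, QuantizeConfig

model_id = "facebook/opt-125m"  # illustrative model id
tokenizer = AutoTokenizer.from_pretrained(model_id)

# A tiny calibration set just to make the sketch self-contained;
# real quantization needs many more samples.
calibration = [tokenizer("GPTQModel moves save_quantized into a writer module.", return_tensors="pt")]

quantize_config = QuantizeConfig(bits=4, group_size=128)
model = GPTQModel.from_pretrained(model_id, quantize_config)
model.quantize(calibration)

# Public API unchanged by this PR; the heavy lifting now happens in
# gptqmodel/models/writer.py (ModelWriter.save_quantized).
model.save_quantized("opt-125m-4bit-gptq", use_safetensors=True)
```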
4 changes: 3 additions & 1 deletion gptqmodel/models/loader.py
@@ -251,7 +251,8 @@ def from_quantized(
quantize_config,
None, # qlinear_kernel
False, # load_quantized_model
cls.generate
cls.generate,
None # checkpoint_file_name: None when the backend is SGLANG or VLLM
)

if quantize_config.format == FORMAT.MARLIN:
@@ -526,4 +527,5 @@ def skip(*args, **kwargs):
qlinear_kernel,
True, # load_quantized_model
None, # generate: None when the backend is not SGLANG or VLLM
model.checkpoint_file_name
)
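For reference, the new call site in base.py implies roughly the following interface for the writer module. This is a sketch inferred from the keyword arguments passed above; the actual definition lives in gptqmodel/models/writer.py, which is not part of this diff, so the classmethod form, type hints, and docstring are assumptions:

```python
# Interface sketch (inferred, not from this diff): ModelWriter.save_quantized
# receives the model wrapper plus everything it previously read from `self`.
from typing import Optional


class ModelWriter:
    @classmethod
    def save_quantized(
        cls,
        caller,                                # the BaseGPTQModel instance passed as `self`
        save_dir: str,
        use_safetensors: bool,
        max_shard_size: Optional[str],
        quantized: bool,
        model_name_or_path: str,
        model,                                 # the underlying torch.nn.Module
        load_quantized_model: bool,
        qlinear_kernel,
        trust_remote_code: bool,
        safetensors_metadata: Optional[dict],
        quantize_config,
        dynamic_expert_index,
        base_modules,
        lm_head: str,
        layer_modules,
        checkpoint_file_name: str,
    ):
        """Write the quantized weights, config, and quantize_config to save_dir."""
        ...
```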