Commit deb7dfc
gguf : add ftype meta info to the model (#2710)
* llama : add ftype meta info to the model
  ggml-ci
* convert.py : add ftype when converting (does not work)
* convert.py : fix Enum to IntEnum
  ggml-ci
1 parent bac6699

4 files changed: +47 −9 lines changed

convert.py

Lines changed: 23 additions & 6 deletions
@@ -69,7 +69,10 @@ class UnquantizedDataType:
     'I32': DT_I32,
 }
 
-class GGMLFileType(enum.Enum):
+# TODO: match this with `llama_ftype`
+# TODO: rename to LLAMAFileType
+# TODO: move to `gguf.py`
+class GGMLFileType(enum.IntEnum):
     AllF32    = 0
     MostlyF16 = 1  # except 1d tensors
 
@@ -101,6 +104,8 @@ class Params:
     n_head_kv:  int
     f_norm_eps: float
 
+    ftype: Optional[GGMLFileType] = None
+
     @staticmethod
     def find_n_mult(n_ff: int, n_embd: int) -> int:
         # hardcoded magic range
@@ -738,6 +743,9 @@ def add_meta_arch(self, params: Params) -> None:
         self.gguf.add_head_count_kv     (params.n_head_kv)
         self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
 
+        if params.ftype:
+            self.gguf.add_file_type(params.ftype)
+
     def add_meta_vocab(self, vocab: Vocab) -> None:
         tokens = []
         scores = []
@@ -1020,6 +1028,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
                 " - LLaMA v2: --ctx 4096\n")
         params.n_ctx = args.ctx
 
+    if args.outtype:
+        params.ftype = {
+            "f32": GGMLFileType.AllF32,
+            "f16": GGMLFileType.MostlyF16,
+        }[args.outtype]
+
     print(f"params = {params}")
 
     vocab: Vocab
@@ -1040,11 +1054,14 @@ def main(args_in: Optional[List[str]] = None) -> None:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
         vocab = load_vocab(vocab_dir, args.vocabtype)
 
-    model = model_plus.model
-    model = convert_model_names(model, params)
-    output_type = pick_output_type(model, args.outtype)
-    model = convert_to_output_type(model, output_type)
-    outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+    model   = model_plus.model
+    model   = convert_model_names(model, params)
+    ftype   = pick_output_type(model, args.outtype)
+    model   = convert_to_output_type(model, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+
+    params.ftype = ftype
+
+    print(f"Writing {outfile}, format {ftype}")
 
     OutputFile.write_all(outfile, params, model, vocab)
     print(f"Wrote {outfile}")

gguf.py

Lines changed: 4 additions & 0 deletions
@@ -26,6 +26,7 @@
 KEY_GENERAL_LICENSE        = "general.license"
 KEY_GENERAL_SOURCE_URL     = "general.source.url"
 KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
+KEY_GENERAL_FILE_TYPE      = "general.file_type"
 
 # LLM
 KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
@@ -595,6 +596,9 @@ def add_source_url(self, url: str):
     def add_source_hf_repo(self, repo: str):
         self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
 
+    def add_file_type(self, ftype: int):
+        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)
+
     def add_name(self, name: str):
         self.add_string(KEY_GENERAL_NAME, name)
 
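
How a writer would use the new helper — a hypothetical snippet, assuming these `add_*` methods live on `gguf.GGUFWriter` (the writer class in this module) and that its constructor takes an output path and an architecture name; the file name is a placeholder:

    import gguf

    writer = gguf.GGUFWriter("out.gguf", "llama")  # hypothetical path and arch
    writer.add_file_type(1)  # stores general.file_type = 1 (mostly F16)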

llama.cpp

Lines changed: 18 additions & 3 deletions
@@ -995,6 +995,16 @@ struct llama_model_loader {
                 } break;
             }
 
+            // this is a way to mark that we have "guessed" the file type
+            ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
+
+            {
+                const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+                if (kid >= 0) {
+                    ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+                }
+            }
+
             for (int i = 0; i < n_kv; i++) {
                 const char * name = gguf_get_key(ctx_gguf, i);
                 const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
@@ -1197,7 +1207,11 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-const char * llama_model_ftype_name(enum llama_ftype ftype) {
+std::string llama_model_ftype_name(enum llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:    return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1426,7 +1440,7 @@ static void llama_model_load_internal(
     LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
-    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype));
+    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml->n_elements*1e-9);
 
     // general kv
@@ -3450,6 +3464,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, model_loader->ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
+    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -4310,7 +4325,7 @@ int llama_model_n_embd(const struct llama_model * model) {
 }
 
 int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype));
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str());
 }
 
 int llama_model_quantize(
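
The order of operations in the loader is the important part: the ftype inferred from the tensor types is immediately tagged with LLAMA_FTYPE_GUESSED, and only an explicit general.file_type entry replaces it (flag and all). A small sketch of that flow in Python; `resolve_ftype` and the dict-based metadata are illustrative, not project code:

    LLAMA_FTYPE_MOSTLY_F16 = 1     # value from llama.h
    LLAMA_FTYPE_GUESSED    = 1024  # value from llama.h

    def resolve_ftype(guessed: int, metadata: dict) -> int:
        ftype = guessed | LLAMA_FTYPE_GUESSED   # assume we only guessed
        if "general.file_type" in metadata:     # explicit value wins outright
            ftype = metadata["general.file_type"]
        return ftype

    # old file without the key: reported as "mostly F16 (guessed)"
    assert resolve_ftype(LLAMA_FTYPE_MOSTLY_F16, {}) == (1 | 1024)
    # new file with the key: exact "mostly F16"
    assert resolve_ftype(LLAMA_FTYPE_MOSTLY_F16, {"general.file_type": 1}) == 1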

llama.h

Lines changed: 2 additions & 0 deletions
@@ -103,6 +103,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_K   = 18, // except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
     typedef struct llama_token_data {
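
1024 works as a marker because it is a power of two above every defined ftype value (the largest is LLAMA_FTYPE_MOSTLY_Q6_K = 18), so the flag bit never overlaps a real ftype and masking it off recovers the original value, exactly as llama_model_ftype_name does above. A quick sanity check, as a sketch:

    LLAMA_FTYPE_MOSTLY_Q6_K = 18
    LLAMA_FTYPE_GUESSED     = 1024

    assert (LLAMA_FTYPE_MOSTLY_Q6_K & LLAMA_FTYPE_GUESSED) == 0  # flag bit is free
    assert (LLAMA_FTYPE_MOSTLY_Q6_K | LLAMA_FTYPE_GUESSED) & ~LLAMA_FTYPE_GUESSED == LLAMA_FTYPE_MOSTLY_Q6_K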
