The convert-llama2c-to-ggml example is mostly functional, but could use some maintenance effort. It also needs an update to support the n_head_kv parameter, which is required for multi-query / grouped-query models (e.g. stories260K).
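For context, with grouped-query attention the K/V projections are narrower than the Q projection by a factor of n_head / n_head_kv. A minimal sketch of the arithmetic (the stories260K embedding size of 64 is an assumption for illustration; the head counts come from the patch below):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // stories260K: 8 query heads share 4 KV heads (dim = 64 is assumed here).
    const uint32_t n_embd    = 64;
    const uint32_t n_head    = 8;
    const uint32_t n_head_kv = 4;

    // Width of the K/V projection matrices under grouped-query attention.
    const uint32_t n_embd_gqa = n_embd * n_head_kv / n_head;

    // With n_head_kv = n_head / 2 this is exactly the "/2" hard-coded below.
    std::printf("wq: %ux%u  wk/wv: %ux%u\n", n_embd, n_embd, n_embd, n_embd_gqa);
    return 0;
}
```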
Here is a quick'n'dirty patch to make it work with stories260K, which uses `n_head = 8` and `n_head_kv = 4`:
```diff
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 8209dcb6..4aab8552 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -162,8 +162,8 @@ static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bo
     if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
     if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
     if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim/2, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim/2)) return 1;
+    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim/2, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim/2)) return 1;
     if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
     if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
     if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
@@ -383,8 +383,8 @@ static void init_model(struct my_llama_model * model) {
         layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

         layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
+        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
         layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);

         layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -697,8 +697,8 @@ static void save_as_llama_model(
         // from 3d matrix layer x dim x dim to 2d matrix dim x dim
         convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
-        convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
-        convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
+        convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/2]);
+        convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/2]);
         convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
         convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
@@ -737,7 +737,7 @@ static void save_as_llama_model(
     gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
     gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
     // n_head_kv is optional, default to n_head
-    // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
+    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head/2);
     gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
     gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
     gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
```
But obviously, a proper implementation is necessary: instead of hard-coding the factor of 2, the converter should read n_kv_heads from the llama2.c checkpoint header and derive the K/V tensor sizes and the GGUF KV_ATTENTION_HEAD_COUNT_KV value from it.
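A sketch of what that could look like; the Config fields mirror llama2.c's checkpoint header, while KVWeights and read_kv_weights are hypothetical stand-ins for the converter's TransformerWeights and checkpoint_init_weights, not existing code:

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the llama2.c checkpoint header, which already carries n_kv_heads.
struct Config {
    int32_t dim;        // transformer embedding dim
    int32_t hidden_dim; // FFN hidden dim
    int32_t n_layers;
    int32_t n_heads;
    int32_t n_kv_heads; // < n_heads for multi-/grouped-query models
    int32_t vocab_size;
    int32_t seq_len;
};

// Hypothetical stand-in for the K/V part of TransformerWeights.
struct KVWeights {
    float * wk;
    float * wv;
};

// Size the K/V reads from n_kv_heads instead of hard-coding a "/2" ratio.
static int read_kv_weights(KVWeights * w, const Config * p, FILE * f) {
    const size_t n_embd_gqa = (size_t) p->dim * p->n_kv_heads / p->n_heads;
    const size_t kv_elems   = (size_t) p->n_layers * p->dim * n_embd_gqa;
    if (fread(w->wk, sizeof(float), kv_elems, f) != kv_elems) return 1;
    if (fread(w->wv, sizeof(float), kv_elems, f) != kv_elems) return 1;
    return 0;
}
```

The GGUF metadata write would then become `gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, n_kv_heads)` once the value is carried through the model hyperparameters, instead of the hard-coded n_head/2 above.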
It would also be useful to add tests to our CI that perform llama2.c model conversions to GGUF. These small models could then be used to create more efficient tests (e.g. #5566 (comment)).