
llama : update the convert-llama2c-to-ggml example #5608

Closed
@ggerganov

Description

The convert-llama2c-to-ggml example is mostly functional, but could use some maintenance effort. It also needs an update to support the n_head_kv parameter, which is required for multi-query models (e.g. stories260K).
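For context, with grouped/multi-query attention the wk and wv projections have n_head_kv * head_dim output rows instead of n_head * head_dim, so their sizes shrink by the ratio n_head_kv / n_head. A minimal illustration of the arithmetic (the embedding size below is assumed purely for the sake of the example):

```cpp
// Illustration only: why a "/2" works for stories260K (n_head = 8, n_head_kv = 4).
#include <cassert>

int main() {
    const int n_head    = 8;            // stories260K
    const int n_head_kv = 4;            // stories260K
    const int dim       = 64;           // assumed embedding size, for illustration
    const int head_dim  = dim / n_head;

    const int q_rows  = n_head    * head_dim; // wq is dim x dim
    const int kv_rows = n_head_kv * head_dim; // wk/wv are dim x (dim/2)

    assert(q_rows  == dim);
    assert(kv_rows == dim / 2); // the hardcoded "/2" in the patch below
    return 0;
}
```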

Here is a quick'n'dirty patch to make it work with stories260K, which uses n_head = 8 and n_head_kv = 4:

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 8209dcb6..4aab8552 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -162,8 +162,8 @@ static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bo
     if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
     if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
     if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim/2, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim/2)) return 1;
+    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim/2, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim/2)) return 1;
     if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
     if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
     if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
@@ -383,8 +383,8 @@ static void init_model(struct my_llama_model * model) {
         layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
         layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
+        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
         layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
 
         layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -697,8 +697,8 @@ static void save_as_llama_model(
 
         // from 3d matrix layer x dim x dim to 2d matrix dim x dim
         convert_weights_ak_to_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-        convert_weights_ak_to_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-        convert_weights_ak_to_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+        convert_weights_ak_to_gg(layer.wk            , &w->wk[i*row_length*row_length/2]);
+        convert_weights_ak_to_gg(layer.wv            , &w->wv[i*row_length*row_length/2]);
         convert_weights_ak_to_gg(layer.wo            , &w->wo[i*row_length*row_length]);
 
         convert_weights_ak_to_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
@@ -737,7 +737,7 @@ static void save_as_llama_model(
     gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
     gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
     // n_head_kv is optional, default to n_head
-    // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
+    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head/2);
     gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
     gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
     gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);

But obviously, a better implementation is necessary.
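A proper fix would presumably read n_kv_heads from the llama2.c checkpoint header and derive the K/V sizes from it instead of hardcoding a division by 2. A rough sketch of the size computation, with the Config field names taken from llama2.c (treat them as assumptions rather than the converter's exact identifiers):

```cpp
// Sketch only: derive the K/V projection width from the checkpoint header.
#include <cstddef>
#include <cstdint>

struct Llama2cConfig {
    int32_t dim;        // transformer embedding size
    int32_t hidden_dim; // FFN hidden size
    int32_t n_layers;
    int32_t n_heads;
    int32_t n_kv_heads; // < n_heads for multi-query / grouped-query models
    int32_t vocab_size;
    int32_t seq_len;
};

// Width of the wk/wv matrices; equals dim when n_kv_heads == n_heads.
int32_t kv_dim(const Llama2cConfig & c) {
    return c.dim * c.n_kv_heads / c.n_heads;
}

// Number of floats occupied by wk (or wv) across all layers, replacing the
// hardcoded "p->dim * p->dim / 2" in the patch above.
size_t kv_weight_elems(const Llama2cConfig & c) {
    return (size_t) c.n_layers * c.dim * kv_dim(c);
}
```

With something along these lines, the tensor shapes, the fread sizes and the KV_ATTENTION_HEAD_COUNT_KV metadata could all be driven by n_kv_heads directly (writing the actual head count instead of n_head/2), and the converter would keep working unchanged for models where n_head_kv == n_head.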

It would also be useful to add tests to our CI that perform llama2.c model conversions to GGUF. These small models could become useful for creating more efficient tests (e.g. #5566 (comment))
