Commit 70321a1

kv-cache : simplify the interface (wip) [no ci]
1 parent a4090d1 commit 70321a1

9 files changed: +52 additions, -131 deletions


examples/simple-chat/simple-chat.cpp

Lines changed: 2 additions & 2 deletions
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
    auto generate = [&](const std::string & prompt) {
        std::string response;

-        const bool is_first = llama_kv_self_used_cells(ctx) == 0;
+        const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0;

        // tokenize the prompt
        const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
        while (true) {
            // check if we have enough space in the context to evaluate this batch
            int n_ctx = llama_n_ctx(ctx);
-            int n_ctx_used = llama_kv_self_used_cells(ctx);
+            int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
            if (n_ctx_used + batch.n_tokens > n_ctx) {
                printf("\033[0m\n");
                fprintf(stderr, "context size exceeded\n");
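
For reference, a minimal caller-side sketch of the pattern this file switches to (it assumes the llama.cpp API at this commit; the helper name context_has_room is illustrative, not part of the example):

    #include "llama.h"

    // For a single-sequence chat, the highest position stored for sequence 0 serves both
    // as the "is the cache empty?" check and as the number of context cells in use.
    static bool context_has_room(llama_context * ctx, const llama_batch & batch) {
        const int32_t n_ctx      = llama_n_ctx(ctx);
        const int32_t n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);

        return n_ctx_used + batch.n_tokens <= n_ctx;
    }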

include/llama.h

Lines changed: 4 additions & 2 deletions
@@ -610,10 +610,12 @@ extern "C" {

    // Returns the number of tokens in the KV cache (slow, use only for debug)
    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
+        "Use llama_kv_self_seq_pos_max() instead");

    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
+    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
+        "Use llama_kv_self_seq_pos_max() instead");

    // Clear the KV cache - both cell info is erased and KV data is zeroed
    LLAMA_API void llama_kv_self_clear(
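
The DEPRECATED(decl, hint) wrapper used here already exists in llama.h; as a hedged sketch (not the verbatim definition from the header), it is typically built on the compiler-specific deprecation attributes:

    #if defined(__GNUC__)
    #    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
    #elif defined(_MSC_VER)
    #    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
    #else
    #    define DEPRECATED(func, hint) func
    #endif

With that expansion, existing callers of llama_kv_self_n_tokens() and llama_kv_self_used_cells() keep compiling but get a warning steering them toward llama_kv_self_seq_pos_max().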

src/llama-batch.cpp

Lines changed: 1 addition & 1 deletion
@@ -283,7 +283,7 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
    if (!batch.pos) {
        pos.resize(batch.n_tokens);
        for (int32_t i = 0; i < batch.n_tokens; i++) {
-            pos[i] = i + p0;
+            pos[i] = p0 + i + 1;
        }
        batch.pos = pos.data();
    }
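
Since p0 is now the highest position already stored for the sequence (see the llama_kv_self_seq_pos_max / seq_pos_max changes elsewhere in this commit) rather than a past-the-end index, the implicit positions start at p0 + 1. A standalone sketch of the arithmetic with illustrative values:

    #include <cstdio>
    #include <vector>

    int main() {
        const int p0       = 11; // e.g. positions 0..11 are already in the cache for this sequence
        const int n_tokens = 4;  // new tokens arriving without explicit positions

        std::vector<int> pos(n_tokens);
        for (int i = 0; i < n_tokens; i++) {
            pos[i] = p0 + i + 1; // same fill as llama_batch_allocr above
        }

        for (int p : pos) {
            printf("%d ", p); // prints: 12 13 14 15
        }
        printf("\n");
        return 0;
    }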

src/llama-context.cpp

Lines changed: 14 additions & 4 deletions
@@ -857,11 +857,17 @@ int llama_context::decode(llama_batch & inp_batch) {
        return -1;
    }

+    if (!inp_batch.pos) {
+        if (inp_batch.seq_id) {
+            LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
+            return -1;
+        }
+    }
+
    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

    // temporary allocate memory for the input batch if needed
-    // TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences
-    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1);
+    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->seq_pos_max(0));

    const llama_batch & batch = batch_allocr.batch;

@@ -2292,22 +2298,26 @@ int32_t llama_apply_adapter_cvec(
// kv cache
//

+// deprecated
int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
    const auto * kv = ctx->get_kv_self();
    if (!kv) {
        return 0;
    }

-    return kv->get_n_tokens();
+#pragma message("implement me")
+    return 0;
}

+// deprecated
int32_t llama_kv_self_used_cells(const llama_context * ctx) {
    const auto * kv = ctx->get_kv_self();
    if (!kv) {
        return 0;
    }

-    return kv->get_used_cells();
+#pragma message("implement me")
+    return 0;
}

void llama_kv_self_clear(llama_context * ctx) {
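
A caller-side sketch of the contract introduced by the new guard in decode() (it assumes the llama.cpp API at this commit): if positions are left implicit, the sequence ids must be left implicit too, which is exactly what llama_batch_get_one() already does.

    #include "llama.h"
    #include <vector>

    // Decode a chunk of tokens relying on implicit positions: llama_batch_get_one()
    // leaves both batch.pos and batch.seq_id as NULL, so decode() derives positions
    // from seq_pos_max(0); a batch with pos == NULL but seq_id set is now rejected.
    static int decode_with_implicit_positions(llama_context * ctx, std::vector<llama_token> & tokens) {
        llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
        return llama_decode(ctx, batch);
    }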

src/llama-kv-cache.cpp

Lines changed: 15 additions & 72 deletions
@@ -30,10 +30,11 @@ llama_kv_cache_unified::llama_kv_cache_unified(
        bool v_trans,
        bool offload,
        uint32_t kv_size,
-        uint32_t padding,
+        uint32_t n_seq_max,
+        uint32_t n_pad,
        uint32_t n_swa,
-        llama_swa_type swa_type) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding), n_swa(n_swa), swa_type(swa_type) {
-    GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");
+        llama_swa_type swa_type) : model(model), hparams(model.hparams), v_trans(v_trans), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
+    GGML_ASSERT(kv_size % n_pad == 0 && "kv_size must be a multiple of padding");

    this->type_k = type_k;
    this->type_v = type_v;
@@ -442,7 +443,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
void llama_kv_cache_unified::defrag_sched(float thold) {
    // - do not defrag small contexts (i.e. < 2048 tokens)
    // - count the padding towards the number of used tokens
-    const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f;
+    const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f;

    // queue defragmentation for next llama_kv_cache_update
    if (fragmentation > thold) {
@@ -558,7 +559,7 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
    // a heuristic, to avoid attending the full cache if it is not yet utilized
    // after enough generations, the benefit from this heuristic disappears
    // if we start defragmenting the cache, the benefit from this will be more important
-    n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding)));
+    n = std::min(size, std::max(n_pad, GGML_PAD(cell_max(), n_pad)));

#ifdef FIND_SLOT_DEBUG
    LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
@@ -567,20 +568,6 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
    return true;
}

-int32_t llama_kv_cache_unified::get_n_tokens() const {
-    int32_t result = 0;
-
-    for (uint32_t i = 0; i < size; i++) {
-        result += cells[i].seq_id.size();
-    }
-
-    return result;
-}
-
-int32_t llama_kv_cache_unified::get_used_cells() const {
-    return used;
-}
-
bool llama_kv_cache_unified::get_can_shift() const {
    return true;
}
@@ -802,16 +789,6 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama
    }
}

-llama_pos llama_kv_cache_unified::get_pos_max() const {
-    llama_pos pos_max = -1;
-
-    for (const auto & cell : cells) {
-        pos_max = std::max(pos_max, cell.pos);
-    }
-
-    return pos_max;
-}
-
size_t llama_kv_cache_unified::total_size() const {
    size_t size = 0;

@@ -1655,17 +1632,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
        ggml_type type_v,
        bool v_trans,
        bool offload,
-        uint32_t kv_size,
        bool swa_full,
+        uint32_t kv_size,
        uint32_t n_seq_max,
        uint32_t n_batch,
-        uint32_t padding) : hparams(model.hparams) {
+        uint32_t n_pad) : hparams(model.hparams) {
    llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
    llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };

    const uint32_t size_base = kv_size;

-    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding));
+    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad));

    // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning
    if (swa_full) {
@@ -1680,14 +1657,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(

    kv_base = std::make_unique<llama_kv_cache_unified>(
            model, std::move(filter_base), type_k, type_v,
-            v_trans, offload, size_base, padding,
+            v_trans, offload, size_base, n_seq_max, n_pad,
            0, LLAMA_SWA_TYPE_NONE);

    LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);

    kv_swa = std::make_unique<llama_kv_cache_unified>(
            model, std::move(filter_swa), type_k, type_v,
-            v_trans, offload, size_swa, padding,
+            v_trans, offload, size_swa, n_seq_max, n_pad,
            hparams.n_swa, hparams.swa_type);
}
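
The SWA cache size above is derived from the window length, the number of sequences, and the batch size, rounded up to the cell padding. A standalone arithmetic sketch with illustrative values (the ROUND_UP macro stands in for GGML_PAD's round-up-to-multiple behaviour):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    #define ROUND_UP(x, n) (((x) + (n) - 1) / (n) * (n)) // round x up to a multiple of n

    int main() {
        const uint32_t size_base = 8192; // base (non-SWA) cache size
        const uint32_t n_swa     = 1024; // sliding window length from hparams
        const uint32_t n_seq_max = 2;    // sequences that may be active at once
        const uint32_t n_batch   = 512;  // a full batch must also fit in the window cache
        const uint32_t n_pad     = 32;   // required cell padding

        const uint32_t size_swa = std::min(size_base, (uint32_t) ROUND_UP(n_swa*n_seq_max + n_batch, n_pad));

        printf("size_swa = %u\n", size_swa); // 1024*2 + 512 = 2560 (already a multiple of 32)
        return 0;
    }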

@@ -1810,18 +1787,6 @@ bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) {
    return res;
}

-int32_t llama_kv_cache_unified_iswa::get_n_tokens() const {
-    return kv_base->get_n_tokens();
-}
-
-int32_t llama_kv_cache_unified_iswa::get_used_cells() const {
-    return kv_base->get_used_cells();
-}
-
-llama_pos llama_kv_cache_unified_iswa::get_pos_max() const {
-    return kv_base->get_pos_max();
-}
-
bool llama_kv_cache_unified_iswa::get_can_shift() const {
    return kv_base->get_size() == kv_swa->get_size();
}
@@ -1853,7 +1818,8 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
        ggml_type type_k,
        ggml_type type_v,
        bool offload,
-        uint32_t kv_size) : hparams(model.hparams) {
+        uint32_t kv_size,
+        uint32_t n_seq_max) : hparams(model.hparams) {
    const int32_t n_layer = hparams.n_layer;

    LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n",
@@ -2203,8 +2169,8 @@ void llama_kv_cache_recurrent::commit() {
    pending.ranges.clear();
}

-bool llama_kv_cache_recurrent::update(llama_context & lctx) {
-    GGML_UNUSED(lctx);
+bool llama_kv_cache_recurrent::update(llama_context & ctx) {
+    GGML_UNUSED(ctx);
    return false;
}

@@ -2408,29 +2374,6 @@ bool llama_kv_cache_recurrent::find_slot(
    return n >= n_seqs;
}

-int32_t llama_kv_cache_recurrent::get_n_tokens() const {
-    int32_t result = 0;
-
-    for (uint32_t i = 0; i < size; i++) {
-        result += cells[i].seq_id.size();
-    }
-
-    return result;
-}
-
-int32_t llama_kv_cache_recurrent::get_used_cells() const {
-    return used;
-}
-
-llama_pos llama_kv_cache_recurrent::get_pos_max() const {
-    llama_pos pos_max = -1;
-    for (const auto & cell : cells) {
-        pos_max = std::max(pos_max, cell.pos);
-    }
-
-    return pos_max;
-}
-
bool llama_kv_cache_recurrent::get_can_shift() const {
    return false;
}
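
The n_pad rename does not change the find_slot() heuristic itself; a standalone sketch of the rounding it performs, with illustrative values (the ROUND_UP macro mirrors GGML_PAD's round-up-to-multiple behaviour for a power-of-two n_pad):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    #define ROUND_UP(x, n) (((x) + (n) - 1) / (n) * (n))

    int main() {
        const uint32_t size     = 4096; // total number of KV cells
        const uint32_t n_pad    = 32;   // required padding
        const uint32_t cell_max = 70;   // highest used cell index + 1 (example value)

        // same expression as llama_kv_cache_unified::find_slot()
        const uint32_t n = std::min(size, std::max(n_pad, (uint32_t) ROUND_UP(cell_max, n_pad)));

        printf("n = %u\n", n); // 70 rounds up to 96, so attention only covers the first 96 cells
        return 0;
    }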

src/llama-kv-cache.h

Lines changed: 10 additions & 29 deletions
@@ -55,10 +55,7 @@ struct llama_kv_cache : public llama_memory_i {
    // =============================================================================================================

    // getters
-    virtual int32_t get_n_tokens() const = 0;
-    virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
-    virtual llama_pos get_pos_max() const = 0;
-    virtual bool get_can_shift() const = 0;
+    virtual bool get_can_shift() const = 0;

    bool get_can_edit() const override { return get_can_shift(); }
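
After this hunk, the only getter the base interface still requires is the capability query; a hedged sketch of a derived cache's getter section (the class name llama_kv_cache_toy is illustrative, not part of the tree):

    #include "llama-kv-cache.h"

    // Hypothetical cache used only to illustrate the slimmed-down getter section:
    // get_can_shift() is the one getter left to override here; the seq_*, find_slot
    // and state I/O pure virtuals elsewhere in the interface are still required
    // before the class could actually be instantiated.
    class llama_kv_cache_toy : public llama_kv_cache {
    public:
        bool get_can_shift() const override { return false; }
    };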

@@ -108,7 +105,8 @@ class llama_kv_cache_unified : public llama_kv_cache {
        bool v_trans,
        bool offload,
        uint32_t kv_size,
-        uint32_t padding,
+        uint32_t n_seq_max,
+        uint32_t n_pad,
        uint32_t n_swa,
        llama_swa_type swa_type);

@@ -150,12 +148,6 @@ class llama_kv_cache_unified : public llama_kv_cache {
    // to the first cell of the slot.
    bool find_slot(const llama_ubatch & batch) override;

-    int32_t get_n_tokens() const override;
-    int32_t get_used_cells() const override;
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos get_pos_max() const override;
-
    bool get_can_shift() const override;

    // state write/load
@@ -229,7 +221,7 @@ class llama_kv_cache_unified : public llama_kv_cache {
    uint32_t n = 0;

    // required padding
-    uint32_t padding = 1;
+    uint32_t n_pad = 1;

    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;
@@ -317,11 +309,11 @@ class llama_kv_cache_unified_iswa : public llama_kv_cache {
        ggml_type type_v,
        bool v_trans,
        bool offload,
-        uint32_t kv_size,
        bool swa_full,
+        uint32_t kv_size,
        uint32_t n_seq_max,
        uint32_t n_batch,
-        uint32_t padding);
+        uint32_t n_pad);

    ~llama_kv_cache_unified_iswa() = default;

@@ -358,12 +350,6 @@ class llama_kv_cache_unified_iswa : public llama_kv_cache {

    bool find_slot(const llama_ubatch & batch) override;

-    int32_t get_n_tokens() const override;
-    int32_t get_used_cells() const override;
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos get_pos_max() const override;
-
    bool get_can_shift() const override;

    // state write/load
@@ -432,7 +418,8 @@ class llama_kv_cache_recurrent : public llama_kv_cache {
        ggml_type type_k,
        ggml_type type_v,
        bool offload,
-        uint32_t kv_size);
+        uint32_t kv_size,
+        uint32_t n_seq_max);

    ~llama_kv_cache_recurrent() = default;

@@ -444,7 +431,7 @@ class llama_kv_cache_recurrent : public llama_kv_cache {

    bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
    void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
-    void seq_keep(llama_seq_id seq_id) override;
+    void seq_keep(llama_seq_id seq_id) override;
    void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
    void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;

@@ -458,7 +445,7 @@ class llama_kv_cache_recurrent : public llama_kv_cache {
    void restore() override;
    void commit() override;

-    bool update(llama_context & lctx) override;
+    bool update(llama_context & ctx) override;

    void defrag_sched(float thold) override;

@@ -469,12 +456,6 @@ class llama_kv_cache_recurrent : public llama_kv_cache {

    bool find_slot(const llama_ubatch & batch) override;

-    int32_t get_n_tokens() const override;
-    int32_t get_used_cells() const override;
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos get_pos_max() const override;
-
    bool get_can_shift() const override;

    // TODO: temporary methods - they are not really const as they do const_cast<>, fix this
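
With get_pos_max() removed from all three implementations, code that needs a global maximum position can ask per sequence instead; a minimal sketch, assuming seq_pos_max() stays available on the llama_kv_cache interface as used by llama_context::decode() in this commit:

    #include <algorithm>

    #include "llama-kv-cache.h"

    // Highest position stored across the first n_seq_max sequences; mirrors what the
    // removed get_pos_max() computed globally over all cells.
    static llama_pos kv_pos_max_across_seqs(llama_kv_cache & kv, uint32_t n_seq_max) {
        llama_pos res = -1;
        for (uint32_t s = 0; s < n_seq_max; ++s) {
            res = std::max(res, kv.seq_pos_max((llama_seq_id) s));
        }
        return res;
    }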
