
Commit ca69f32

llama : auto-batch
ggml-ci
1 parent f23e4cc

File tree

3 files changed, +80 -87 lines


src/llama-context.cpp

Lines changed: 54 additions & 36 deletions
@@ -424,9 +424,9 @@ const llama_kv_cache * llama_context::get_kv_self() const {
     return kv_self;
 }
 
-void llama_context::kv_self_update() {
+bool llama_context::kv_self_update() {
     if (!memory) {
-        return;
+        return false;
     }
 
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
@@ -445,7 +445,11 @@ void llama_context::kv_self_update() {
         if (!gf) {
             LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__);
         }
+
+        return true;
     }
+
+    return false;
 }
 
 enum llama_pooling_type llama_context::pooling_type() const {
@@ -933,25 +937,53 @@ int llama_context::decode(llama_batch & inp_batch) {
     // handle any pending defrags/shifts
     kv_self_update();
 
-    auto kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
-    if (!kv_state) {
-        return -2;
-    }
+    llama_memory_state_ptr kv_state;
 
-    switch (kv_state->get_status()) {
-        case LLAMA_MEMORY_STATUS_SUCCESS:
-            {
-            } break;
-        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
-            {
-                // not a fatal error, we can re-try with a different batch
-                return 1;
-            }
-        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
-            {
-                return -2;
-            }
-    }
+    bool did_defrag = false;
+    auto n_ubatch = cparams.n_ubatch;
+
+    do {
+        kv_state = kv_self->init_batch(batch, n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
+        if (!kv_state) {
+            return -2;
+        }
+
+        switch (kv_state->get_status()) {
+            case LLAMA_MEMORY_STATUS_SUCCESS:
+                {
+                } break;
+            case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+                {
+                    if (!did_defrag) {
+                        did_defrag = true;
+
+                        kv_self->defrag_sched(-1.0f);
+                        if (kv_self_update()) {
+                            LLAMA_LOG_DEBUG("%s: failed to init batch of size %d, retrying after defrag\n", __func__, batch.n_tokens);
+
+                            continue;
+                        }
+                    }
+
+                    if (n_ubatch > 1) {
+                        n_ubatch /= 2;
+
+                        LLAMA_LOG_DEBUG("%s: failed to find free space in the KV cache, retrying with smaller ubatch size: n_ubatch = %d\n", __func__, n_ubatch);
+                        continue;
+                    }
+
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+
+                    return 1;
+                }
+            case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+                {
+                    return -2;
+                }
+        }
+
+        break;
+    } while(true);
 
     // reserve output buffer
     if (output_reserve(n_outputs_all) < n_outputs_all) {
@@ -2646,22 +2678,8 @@ int32_t llama_encode(
 int32_t llama_decode(
         llama_context * ctx,
           llama_batch   batch) {
-    int ret = ctx->decode(batch);
-
-    // defrag and try again
-    // TODO: distinguish return code when we are sure that even after defrag there is no space available
-    if (ret == 1) {
-        llama_kv_self_defrag(ctx);
-        ret = ctx->decode(batch);
-
-        if (ret == 1) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
-
-            return ret;
-        }
-    }
-
-    if (ret != 0) {
+    const int ret = ctx->decode(batch);
+    if (ret != 0 && ret != 1) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
 
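Note: after this change the llama_decode() wrapper no longer defrags and retries itself; the retries (schedule a defrag, then halve n_ubatch) happen inside llama_context::decode(), and a return value of 1 now reaches the caller, meaning that no KV cache slot was found even after those retries. A minimal caller-side sketch of the resulting contract follows — hypothetical usage, not part of this commit; the decode_tokens helper and surrounding setup are illustrative, only llama_decode() and llama_batch_get_one() are existing llama.h calls.

// sketch: hypothetical caller handling the new llama_decode() return codes
#include <cstdio>
#include <vector>

#include "llama.h"

static int decode_tokens(llama_context * ctx, std::vector<llama_token> & tokens) {
    const int ret = llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size()));

    if (ret == 1) {
        // not fatal: no KV cache slot was found even after the internal
        // defrag + ubatch-halving retries - free some sequences or retry
        // with a smaller batch
        fprintf(stderr, "KV cache full for batch of %zu tokens\n", tokens.size());
    } else if (ret != 0) {
        // fatal: invalid input batch (ret == -1) or compute error (ret < -1)
        fprintf(stderr, "decode failed, ret = %d\n", ret);
    }

    return ret;
}
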
src/llama-context.h

Lines changed: 2 additions & 1 deletion
@@ -50,8 +50,9 @@ struct llama_context {
           llama_kv_cache * get_kv_self();
     const llama_kv_cache * get_kv_self() const;
 
+    // return true if the KV cache was updated
     // TODO: remove
-    void kv_self_update();
+    bool kv_self_update();
 
     enum llama_pooling_type pooling_type() const;
 
tools/server/server.cpp

Lines changed: 24 additions & 50 deletions
@@ -3385,75 +3385,49 @@ struct server_context {
         }
 
         // process the created batch of tokens
-        for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
-            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
-
-            llama_batch batch_view = {
-                n_tokens,
-                batch.token    + i,
-                nullptr,
-                batch.pos      + i,
-                batch.n_seq_id + i,
-                batch.seq_id   + i,
-                batch.logits   + i,
-            };
-
-            const int ret = llama_decode(ctx, batch_view);
-
-            metrics.on_decoded(slots);
+        {
+            const int ret = llama_decode(ctx, batch);
 
             if (ret != 0) {
-                {
-                    std::string err;
-
-                    if (n_batch == 1 && ret == 1) {
-                        err = "Context size has been exceeded.";
-                    }
-
-                    if (ret == -1) {
-                        err = "Invalid input batch.";
-                    }
+                std::string err;
 
-                    if (ret < -1) {
-                        err = "Compute error.";
-                    }
-
-                    if (!err.empty()) {
-                        SRV_ERR("%s, i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret);
-                        for (auto & slot : slots) {
-                            slot.release();
-                            send_error(slot, err);
-                        }
-                        break;
-                    }
+                if (ret == 1) {
+                    err = "Context size has been exceeded.";
                 }
 
-                // retry with half the batch size to try to find a free slot in the KV cache
-                n_batch /= 2;
+                if (ret == -1) {
+                    err = "Invalid input batch.";
+                }
 
-                SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
+                if (ret < -1) {
+                    err = "Compute error.";
+                }
 
-                i -= n_batch;
+                if (!err.empty()) {
+                    SRV_ERR("%s, n_batch = %d, ret = %d\n", err.c_str(), n_batch, ret);
+                    for (auto & slot : slots) {
+                        slot.release();
+                        send_error(slot, err);
+                    }
 
-                continue; // continue loop of n_batch
+                    return;
+                }
             }
 
-            for (auto & slot : slots) {
-                if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
-                    continue; // continue loop of slots
-                }
+            metrics.on_decoded(slots);
 
+            for (auto & slot : slots) {
                 if (slot.state == SLOT_STATE_DONE_PROMPT) {
                     if (slot.task_type == SERVER_TASK_TYPE_EMBEDDING) {
                         // prompt evaluated for embedding
-                        send_embedding(slot, batch_view);
+                        send_embedding(slot, batch);
                         slot.release();
                         slot.i_batch = -1;
                         continue; // continue loop of slots
                     }
 
                     if (slot.task_type == SERVER_TASK_TYPE_RERANK) {
-                        send_rerank(slot, batch_view);
+                        send_rerank(slot, batch);
                         slot.release();
                         slot.i_batch = -1;
                         continue; // continue loop of slots
@@ -3465,7 +3439,7 @@ struct server_context {
                     continue; // continue loop of slots
                 }
 
-                const int tok_idx = slot.i_batch - i;
+                const int tok_idx = slot.i_batch;
 
                 llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
 