context : encode() clears embd_seq

ggerganov · ggerganov · commit 4f0ea9b30afc · 2025-04-25T15:19:44.000+03:00
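Besides clearing embd_seq near the start of encode(), this change makes
decode() forward the batch to encode() when `memory` is not set (the
n_tokens == 0 check now runs after that fallback), and passes
is_connection_closed to the server's handle_completions_impl by const
reference instead of by value.

ggml-ci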
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -3941,7 +3941,7 @@ int main(int argc, char ** argv) {
     const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
             server_task_type type,
             json & data,
-            std::function<bool()> is_connection_closed,
+            const std::function<bool()> & is_connection_closed,
             httplib::Response & res,
             oaicompat_type oaicompat) {
         GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -699,6 +699,8 @@ int llama_context::encode(llama_batch & inp_batch) {
         t_compute_start_us = ggml_time_us();
     }
 
+    embd_seq.clear();
+
     n_queued_tokens += n_tokens;
 
     const int64_t n_embd = hparams.n_embd;
@@ -839,13 +841,13 @@ int llama_context::encode(llama_batch & inp_batch) {
 }
 
 int llama_context::decode(llama_batch & inp_batch) {
-    if (inp_batch.n_tokens == 0) {
-        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
-        return -1;
-    }
-
     if (!memory) {
         LLAMA_LOG_WARN("%s: cannot decode batches with this context\n", __func__);
+        return encode(inp_batch);
+    }
+
+    if (inp_batch.n_tokens == 0) {
+        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }