@@ -158,6 +158,7 @@ struct llama_client_slot
     int32_t n_decoded   = 0;
     int32_t n_remaining = -1;
     int32_t i_batch     = -1;
+    int32_t n_predict   = -1;
 
     int32_t num_prompt_tokens           = 0;
     int32_t num_prompt_tokens_processed = 0;
@@ -409,6 +410,7 @@ struct llama_server_context
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
+            slot.n_predict = params.n_predict;
 
             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
             slots.push_back(slot);
@@ -545,6 +547,15 @@ struct llama_server_context
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
 
+        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+            // Might be better to reject the request with a 400 ?
+            LOG_WARNING("Max tokens to predict exceeds server configuration", {
+                {"params.n_predict", slot->params.n_predict},
+                {"slot.n_predict", slot->n_predict},
+            });
+            slot->params.n_predict = slot->n_predict;
+        }
+
         // infill
         if (data.count("input_prefix") != 0)
         {
@@ -1052,6 +1063,7 @@ struct llama_server_context
                                 eos_bias->second < 0.0f && std::isinf(eos_bias->second);
         return json {
             {"n_ctx",       slot.n_ctx},
+            {"n_predict",   slot.n_predict},
             {"model",       params.model_alias},
             {"seed",        slot.params.seed},
             {"temperature", slot.sparams.temp},