@@ -910,7 +910,7 @@ struct server_context
         slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
         slot.params.n_keep       = json_value(data, "n_keep",      slot.params.n_keep);
         slot.params.n_discard    = json_value(data, "n_discard",   default_params.n_discard);
-        slot.params.seed         = json_value(data, "seed",        default_params.seed);
+        slot.sparams.seed        = json_value(data, "seed",        default_sparams.seed);
         slot.sparams.n_probs     = json_value(data, "n_probs",     default_sparams.n_probs);
         slot.sparams.min_keep    = json_value(data, "min_keep",    default_sparams.min_keep);
         slot.sparams.grammar     = json_value(data, "grammar",     default_sparams.grammar);
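The `seed` request field now lands in the sampling parameters (`slot.sparams.seed`, with its default taken from `default_sparams`) rather than in the generic slot parameters, consistent with the other sampling settings parsed in this block. For orientation, the `json_value` pattern used on every line here boils down to a field lookup with a fallback default; a minimal sketch (helper name and exact behaviour are assumptions, not the project's actual implementation):

```cpp
// Hypothetical sketch of a json_value-style helper, assuming nlohmann::json.
// The real helper in this server code may add type checks or logging.
#include <string>
#include <nlohmann/json.hpp>

template <typename T>
static T json_value_sketch(const nlohmann::json & body, const std::string & key, const T & default_value) {
    // Use the field if present and non-null, otherwise fall back to the default.
    return (body.contains(key) && !body.at(key).is_null()) ? body.at(key).get<T>() : default_value;
}
```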
@@ -1209,7 +1209,7 @@ struct server_context
     bool process_token(completion_token_output & result, server_slot & slot)
     {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
         slot.sampled = result.tok;
 
         // search stop word and delete it
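The extra `false` argument to `llama_token_to_piece` is the flag that controls whether special tokens are rendered into the returned piece; passing `false` keeps control tokens out of the streamed text. (This describes the common-helper overload that takes a boolean; if the signature in this tree differs, treat it as an assumption.)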
@@ -1314,6 +1314,27 @@ struct server_context
             LOG_VERBOSE("eos token found", {});
         }
 
+        auto n_ctx_train = llama_n_ctx_train(model);
+        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1
+                && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
+            LOG_WARNING("n_predict is not set and self-context extend is disabled."
+                        " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
+                { "id_slot",              slot.id },
+                { "params.n_predict",     slot.params.n_predict },
+                { "slot.n_prompt_tokens", slot.n_prompt_tokens },
+                { "slot.n_decoded",       slot.n_decoded },
+                { "slot.n_predict",       slot.n_predict },
+                { "n_slots",              params.n_parallel },
+                { "slot.n_ctx",           slot.n_ctx },
+                { "n_ctx",                n_ctx },
+                { "n_ctx_train",          n_ctx_train },
+                { "ga_n",                 slot.ga_n },
+            });
+            slot.truncated      = true;
+            slot.stopped_limit  = true;
+            slot.has_next_token = false; // stop prediction
+        }
+
         LOG_VERBOSE("next token", {
             {"id_slot",   slot.id},
             {"id_task",   slot.id_task},
@@ -1475,8 +1496,9 @@ struct server_context
         {
             const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
 
+            size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
             probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(),
-                slot.generated_token_probs.end() - stop_word_toks.size());
+                slot.generated_token_probs.end() - safe_offset);
         }
         else
         {
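Clamping the tail offset with `std::min` prevents `generated_token_probs.end() - stop_word_toks.size()` from stepping back past `begin()` when the tokenized stop word contains more tokens than probabilities were recorded, which would be undefined behaviour. The same clamp in isolation, as a small sketch:

```cpp
// Sketch of the clamp: copy everything except the last n entries,
// never moving the end() iterator back past begin().
#include <algorithm>
#include <vector>

template <typename T>
static std::vector<T> copy_without_tail(const std::vector<T> & v, size_t n) {
    const size_t safe_offset = std::min(v.size(), n); // clamp so end() - safe_offset >= begin()
    return std::vector<T>(v.begin(), v.end() - safe_offset);
}
```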
@@ -2313,7 +2335,7 @@ struct server_context
             });
 
             // process the created batch of tokens
-            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
+            for (int32_t i = 0; i < batch.n_tokens; i += n_batch)
             {
                 const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
 
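The cast is dropped because `llama_batch::n_tokens` is already an `int32_t`, so comparing it against `i` needs no conversion.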
@@ -2534,6 +2556,7 @@ static void server_params_parse(json jparams, server_params &sparams, gpt_params
     params.embedding        = json_value(jparams, "embedding",        default_params.embedding);
     params.escape           = json_value(jparams, "escape",           default_params.escape);
     params.cont_batching    = json_value(jparams, "cont_batching",    default_params.cont_batching);
+    params.flash_attn       = json_value(jparams, "flash_attn",       default_params.flash_attn);
     params.input_prefix_bos = json_value(jparams, "input_prefix_bos", default_params.input_prefix_bos);
     params.ignore_eos       = json_value(jparams, "ignore_eos",       default_params.ignore_eos);
     params.use_mmap         = json_value(jparams, "use_mmap",         default_params.use_mmap);
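With this line, a `flash_attn` flag in the JSON server configuration is forwarded into `gpt_params`, so flash attention can be toggled the same way as neighbouring options such as `cont_batching` and `use_mmap` (for example `"flash_attn": true` in the config object; the overall config shape is only implied by the other fields parsed here).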
@@ -2596,4 +2619,6 @@ static void server_params_parse(json jparams, server_params &sparams, gpt_params
         LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
 #endif
     }
+
+    gpt_params_handle_model_default(params);
 }
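Calling `gpt_params_handle_model_default(params)` at the end of parsing applies the same model-default resolution that upstream llama.cpp runs after CLI argument parsing, e.g. deriving a usable model path when only a model URL or Hugging Face repo/file is configured; the exact fallbacks are whatever the linked common library implements.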