@@ -1301,26 +1301,7 @@ struct server_context {
         }

         // check if there is incomplete UTF-8 character at the end
-        bool incomplete = false;
-        for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) {
-            unsigned char c = slot.generated_text[slot.generated_text.size() - i];
-            if ((c & 0xC0) == 0x80) {
-                // continuation byte: 10xxxxxx
-                continue;
-            }
-            if ((c & 0xE0) == 0xC0) {
-                // 2-byte character: 110xxxxx ...
-                incomplete = i < 2;
-            } else if ((c & 0xF0) == 0xE0) {
-                // 3-byte character: 1110xxxx ...
-                incomplete = i < 3;
-            } else if ((c & 0xF8) == 0xF0) {
-                // 4-byte character: 11110xxx ...
-                incomplete = i < 4;
-            }
-            // else 1-byte character or invalid byte
-            break;
-        }
+        bool incomplete = validate_utf8(slot.generated_text) < slot.generated_text.size();

         if (!incomplete) {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
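The replacement line above assumes a `validate_utf8` helper that returns how many leading bytes of the string form complete UTF-8 sequences, so that `validate_utf8(s) < s.size()` flags a truncated trailing character. Its definition is not part of this hunk; the sketch below only illustrates that contract, reconstructed from the removed inline loop, and may differ from the helper the PR actually uses.

```cpp
#include <string>

// Illustrative sketch (not the PR's definition): length of the longest prefix of
// `text` that ends on a complete UTF-8 sequence. An incomplete trailing sequence
// is cut off, so validate_utf8(s) < s.size() means "wait for more bytes".
static size_t validate_utf8(const std::string & text) {
    const size_t len = text.size();
    if (len == 0) {
        return 0;
    }
    // look back at most 4 bytes for the lead byte of the last sequence
    for (size_t i = 1; i <= 4 && i <= len; ++i) {
        const unsigned char c = text[len - i];
        if ((c & 0xC0) == 0x80) {
            continue; // continuation byte 10xxxxxx, keep scanning backwards
        }
        size_t seq_len = 1;                       // 1-byte character or invalid byte
        if      ((c & 0xE0) == 0xC0) seq_len = 2; // 110xxxxx
        else if ((c & 0xF0) == 0xE0) seq_len = 3; // 1110xxxx
        else if ((c & 0xF8) == 0xF0) seq_len = 4; // 11110xxx
        // fewer continuation bytes present than the lead byte announces -> truncated
        return i < seq_len ? len - i : len;
    }
    return len; // only continuation bytes in the last 4 positions: treat as complete
}
```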
@@ -1416,6 +1397,33 @@ struct server_context {
         return slot.has_next_token; // continue
     }

+    void populate_token_probs(const server_slot & slot, completion_token_output & result, bool special, int idx) {
+        size_t n_probs = slot.sparams.n_probs;
+        size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+        // TODO: optimize this with min-p optimization
+        std::vector<llama_token_data> cur = get_token_probabilities(ctx, idx);
+
+        // set probability for sampled token
+        for (size_t i = 0; i < n_vocab; i++) {
+            // set probability for sampled token
+            if (cur[i].id == result.tok) {
+                result.prob = cur[i].p;
+                break;
+            }
+        }
+
+        // set probability for top n_probs tokens
+        result.probs.reserve(n_probs);
+        for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
+            result.probs.push_back({
+                cur[i].id,
+                llama_detokenize(ctx, {cur[i].id}, special),
+                cur[i].p
+            });
+        }
+    }
+
     json get_formated_generation(const server_slot & slot) const {
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
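`populate_token_probs` leans on a `get_token_probabilities(ctx, idx)` helper that is not shown in this diff. From the way its result is used above (a linear scan for the sampled token, then the first `n_probs` entries taken as the top candidates), it presumably returns a softmax over the raw logits at batch position `idx`, sorted by descending probability. A sketch under that assumption:

```cpp
#include "llama.h"

#include <algorithm>
#include <cmath>
#include <vector>

// Illustrative sketch (the helper's real definition is outside this hunk):
// probabilities for the whole vocabulary at batch position `idx`, highest first.
static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
    const int     n_vocab = llama_n_vocab(llama_get_model(ctx));
    const float * logits  = llama_get_logits_ith(ctx, idx);

    std::vector<llama_token_data> cur(n_vocab);
    float max_logit = logits[0];
    for (llama_token tok = 0; tok < n_vocab; tok++) {
        cur[tok]  = llama_token_data{tok, logits[tok], 0.0f};
        max_logit = std::max(max_logit, logits[tok]);
    }

    // softmax over the raw, unfiltered logits (i.e. before any sampler filtering)
    float sum = 0.0f;
    for (auto & td : cur) {
        td.p = expf(td.logit - max_logit);
        sum += td.p;
    }
    for (auto & td : cur) {
        td.p /= sum;
    }

    // sort by probability, highest first
    std::sort(cur.begin(), cur.end(),
              [](const llama_token_data & a, const llama_token_data & b) { return a.p > b.p; });
    return cur;
}
```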
@@ -1507,19 +1515,7 @@ struct server_context {
         };

         if (slot.sparams.n_probs > 0) {
-            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
-            const size_t probs_pos      = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
-            const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
-
-            std::vector<completion_token_output> probs_output;
-            if (probs_pos < probs_stop_pos) {
-                probs_output = std::vector<completion_token_output>(
-                    slot.generated_token_probs.begin() + probs_pos,
-                    slot.generated_token_probs.begin() + probs_stop_pos);
-            }
-            slot.n_sent_token_probs = probs_stop_pos;
-
-            res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
+            res.data["completion_probabilities"] = probs_vector_to_json(ctx, {tkn});
         }

         if (slot.oaicompat) {
@@ -1559,7 +1555,7 @@ struct server_context {
             {"timings", slot.get_formated_timings()}
         };

-        if (slot.sparams.n_probs > 0) {
+        if (!slot.params.stream && slot.sparams.n_probs > 0) {
            std::vector<completion_token_output> probs;
            if (!slot.params.stream && slot.stopped_word) {
                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
@@ -2513,7 +2509,8 @@ struct server_context {
             }

             completion_token_output result;
-            const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
+            const int tok_idx = slot.i_batch - i;
+            const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, tok_idx);

             llama_sampling_accept(slot.ctx_sampling, ctx, id, true);

@@ -2526,32 +2523,10 @@ struct server_context {

             llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
             result.tok = id;
+            result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs

-            const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
-            if (n_probs > 0) {
-                const size_t n_valid = slot.ctx_sampling->n_valid;
-
-                // Make sure at least n_probs top tokens are at the front of the vector:
-                if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
-                    llama_sample_top_k(ctx, &cur_p, n_probs, 0);
-                }
-
-                if (slot.sparams.temp == 0.0f) {
-                    // With greedy sampling the probabilities have possibly not been calculated.
-                    for (size_t i = 0; i < n_probs; ++i) {
-                        result.probs.push_back({
-                            cur_p.data[i].id,
-                            i == 0 ? 1.0f : 0.0f
-                        });
-                    }
-                } else {
-                    for (size_t i = 0; i < n_probs; ++i) {
-                        result.probs.push_back({
-                            cur_p.data[i].id,
-                            i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
-                        });
-                    }
-                }
+            if (slot.sparams.n_probs > 0) {
+                populate_token_probs(slot, result, params.special, tok_idx);
             }

             if (!process_token(result, slot)) {
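For readability, here is the shape of `completion_token_output` implied by the fields this diff touches (`tok`, `prob`, `probs`, `text_to_send`); the actual struct lives elsewhere in the server sources, and the member names below are partly guessed.

```cpp
#include "llama.h"

#include <string>
#include <vector>

// Illustrative sketch of the struct as it is used in this diff; not the real definition.
struct completion_token_output_sketch {
    struct prob_info {
        llama_token tok;      // candidate token id
        std::string tok_str;  // detokenized text (filled via llama_detokenize above)
        float       prob;     // probability of the candidate at this position
    };

    llama_token            tok;           // the token that was actually sampled
    float                  prob = 1.0f;   // probability of the sampled token (see the TODO above)
    std::vector<prob_info> probs;         // top n_probs candidates
    std::string            text_to_send;  // text chunk streamed for this token
};
```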
@@ -2601,6 +2576,12 @@ static json format_final_response_oaicompat(const json& request, json result, co
                  {"message", json{{"content", content},
                                   {"role", "assistant"}}}}});

+    if (result.contains("completion_probabilities")) {
+        choices[0]["logprobs"] = json{
+            {"content", json_value(result, "completion_probabilities", json::array())},
+        };
+    }
+
     std::time_t t = std::time(0);

     json res = json{
@@ -2621,10 +2602,6 @@ static json format_final_response_oaicompat(const json& request, json result, co
         res["__verbose"] = result;
     }

-    if (result.contains("completion_probabilities")) {
-        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
-    }
-
     return res;
 }

@@ -2712,6 +2689,12 @@ static std::vector<json> format_partial_response_oaicompat(server_task_result ta
         }
     }

+    if (result.contains("completion_probabilities")) {
+        choices[0]["logprobs"] = json{
+            {"content", json_value(result, "completion_probabilities", json::array())},
+        };
+    }
+
     json ret = json{
         {"choices", choices},
         {"created", t},
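The net effect of the last two hunks: the probabilities are no longer attached as a top-level, non-OpenAI `completion_probabilities` field on the final response, but are nested under each choice as `choices[0].logprobs.content` in both the final and the streamed OpenAI-compatible payloads. Below is a sketch of how a client-side reader would adjust, assuming nlohmann::json (the library the server itself uses); the helper name is made up for illustration.

```cpp
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Illustrative sketch: where the per-token probabilities now live in a parsed
// OpenAI-compatible response. The keys inside each `content` entry come from
// probs_vector_to_json and are not shown in this diff.
static json extract_logprobs(const json & response) {
    // before this change: response["completion_probabilities"] (final response only)
    // after this change:  response["choices"][0]["logprobs"]["content"]
    const json & choice = response.at("choices").at(0);
    if (choice.contains("logprobs")) {
        return choice.at("logprobs").value("content", json::array());
    }
    return json::array();
}
```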