From 3c55062c1f816a2e93b421c65ae9b521f81b5fdb Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <[email protected]>
Date: Thu, 30 Oct 2025 14:57:35 +0100
Subject: [PATCH] rm pos vector
---
 tools/server/server.cpp | 17 +++++++++----
 tools/server/utils.hpp  | 56 ++++++++++++++++-------------------------
 2 files changed, 34 insertions(+), 39 deletions(-)
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index cd12e8479..9531f8bb0 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3908,8 +3908,9 @@ struct server_context {
     }
     // truncate any tokens that are beyond n_past for this slot
-    if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1)) {
-        SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", slot.prompt.n_tokens());
+    llama_pos p0 = slot.prompt.tokens.get_next_pos();
+    if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
+        SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", p0);
         llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
         // there is no common part left
@@ -3918,7 +3919,7 @@ struct server_context {
         slot.prompt.tokens.clear();
     }
-    SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), slot.prompt.n_tokens());
+    SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
     // remove the non-common part from the cache
     slot.prompt.tokens.keep_first(slot.prompt.n_tokens());
@@ -3927,7 +3928,13 @@ struct server_context {
     if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
         // process the image
         size_t n_tokens_out = 0;
-        int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.id, n_tokens_out);
+        int32_t res = input_tokens.process_chunk(
+            ctx,
+            mctx,
+            slot.prompt.n_tokens(),
+            slot.prompt.tokens.get_next_pos(),
+            slot.id,
+            n_tokens_out);
         if (res != 0) {
             SLT_ERR(slot, "failed to process image, res = %d\n", res);
             send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
@@ -3994,7 +4001,7 @@ struct server_context {
         // embedding requires all tokens in the batch to be output
         common_batch_add(batch,
             cur_tok,
-            input_tokens.get_pos(slot.prompt.n_tokens()),
+            slot.prompt.tokens.get_next_pos(),
             { slot.id },
             slot.need_embd());
         slot.prompt.tokens.push_back(cur_tok);
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 75c26f8a3..607be7f3e 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -1088,12 +1088,9 @@ private: // disallow accessing these members directly, risking out-of-sync
     // if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by media chunk
     // otherwise, it is a normal text token
     // note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
+    // note(2): for M-RoPE, an image can occupy a different number of pos; do not assume a 1-to-1 mapping tokens <-> pos
     llama_tokens tokens;
-    // the position per-token (llama_pos) in the overall input
-    // useful for M-RoPE, where the position is different from the index in tokens
-    std::vector<llama_pos> pos;
-
     // for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
     // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
     // idx 0 1 2 3 4 5 6 7 8 9 10
@@ -1123,29 +1120,28 @@ public:
         }
     }
-    server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {
-        for (llama_pos i = 0; i < (llama_pos)tokens.size(); ++i) {
-            pos.push_back(i);
-        }
-    }
+    server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
-    llama_pos next_pos() const {
+    // get next position in the sequence
+    // see: https://github.com/ggml-org/llama.cpp/pull/16818#discussion_r2477778621
+    llama_pos get_next_pos() const {
         if (tokens.empty()) {
             return 0;
-        } else if (tokens.back() != LLAMA_TOKEN_NULL) {
-            return pos.back() + 1;
-        } else {
-            // find the last media chunk
-            GGML_ASSERT(has_mtmd);
-            GGML_ASSERT(!map_idx_to_media.empty());
-            const auto & chunk = map_idx_to_media.rbegin()->second;
-            return pos.back() + mtmd_input_chunk_get_n_pos(chunk.get());
         }
-    }
-    llama_pos get_pos(size_t idx) const {
-        GGML_ASSERT(idx < pos.size());
-        return pos[idx];
+        llama_pos pos = tokens.size();
+        if (!has_mtmd || map_idx_to_media.empty()) {
+            return pos;
+        }
+
+        for (const auto & it : map_idx_to_media) {
+            const auto & chunk = it.second;
+            // subtract media tokens
+            pos -= mtmd_input_chunk_get_n_tokens(chunk.get());
+            // add media positions
+            pos += mtmd_input_chunk_get_n_pos(chunk.get());
+        }
+        return pos;
     }
     // for debugging
@@ -1154,12 +1150,11 @@ public:
         oss << "tokens: ";
         for (size_t idx = 0; idx < tokens.size(); ++idx) {
             llama_token t = tokens[idx];
-            llama_pos p = pos[idx];
             oss << "idx:" << idx << " ";
             if (t == LLAMA_TOKEN_NULL) {
-                oss << "<embd>(" << p << ")\n";
+                oss << "<embd>\n";
             } else {
-                oss << t << "(" << p << ")\n";
+                oss << t << "\n";
             }
         }
         oss << "\n";
@@ -1182,7 +1177,6 @@ public:
         if (tok == LLAMA_TOKEN_NULL) {
             throw std::runtime_error("Invalid token");
         }
-        pos.emplace_back(next_pos());
         tokens.emplace_back(tok);
     }
@@ -1192,10 +1186,8 @@ public:
         if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
             GGML_ASSERT(has_mtmd);
             const size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
-            const llama_pos cur_pos = next_pos();
             size_t start_idx = tokens.size();
             for (size_t i = 0; i < n_tokens; ++i) {
-                pos.emplace_back(cur_pos);
                 tokens.emplace_back(LLAMA_TOKEN_NULL);
             }
             mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
@@ -1233,11 +1225,6 @@ public:
     void insert(const llama_tokens & inp_tokens) {
         GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
         tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
-        // rebuild the pos vector
-        pos.clear();
-        for (llama_pos i = 0; i < (llama_pos)tokens.size(); ++i) {
-            pos.emplace_back(i);
-        }
     }
     // for compatibility with speculative decoding, ctx shift, slot save/load
@@ -1386,6 +1373,7 @@ public:
             llama_context * ctx,
             mtmd_context * mctx,
             size_t idx,
+            llama_pos pos,
             int32_t seq_id,
             size_t & n_tokens_out) const {
         const auto & chunk = find_chunk(idx);
@@ -1397,7 +1385,7 @@ public:
         llama_pos new_n_past; // unused for now
         int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
             chunk.get(),
-            pos[idx], // position
+            pos,
             seq_id,
             n_batch,
             true, // logits last
--
2.39.5 (Apple Git-154)
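
For readers following the change: the patch drops the per-token `pos` vector and instead recomputes the next position on demand, starting from the token count and, for each media chunk, subtracting the tokens the chunk occupies in the token list and adding the positions it actually consumes (for M-RoPE the two can differ). Below is a minimal, self-contained sketch of that arithmetic; the `media_chunk` struct and `next_pos` function are hypothetical stand-ins, not the llama.cpp API, where the counts come from `mtmd_input_chunk_get_n_tokens` and `mtmd_input_chunk_get_n_pos`.

```cpp
// Minimal sketch of the get_next_pos() arithmetic; media_chunk/next_pos are
// hypothetical names used only for illustration.
#include <cstdint>
#include <cstdio>
#include <vector>

struct media_chunk {
    int32_t n_tokens; // memory cells the chunk occupies in the token list
    int32_t n_pos;    // positions the chunk consumes (M-RoPE: may differ from n_tokens)
};

int32_t next_pos(size_t n_tokens_total, const std::vector<media_chunk> & media) {
    if (n_tokens_total == 0) {
        return 0;
    }
    int32_t pos = (int32_t) n_tokens_total;
    for (const auto & c : media) {
        pos -= c.n_tokens; // subtract media tokens
        pos += c.n_pos;    // add media positions
    }
    return pos;
}

int main() {
    // the example from the utils.hpp comment: 5 text tokens + 2 images,
    // each image occupying 3 tokens but only 2 positions;
    // token list length = 5 + 3 + 3 = 11, next position = 11 - 6 + 4 = 9
    std::vector<media_chunk> media = { {3, 2}, {3, 2} };
    std::printf("next pos = %d\n", (int) next_pos(11, media));
    return 0;
}
```

The trade-off mirrors the diff: instead of keeping a per-token position vector in sync with `tokens`, the next position is recomputed when needed with a loop over the (typically few) media chunks in `map_idx_to_media`.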