From 3c55062c1f816a2e93b421c65ae9b521f81b5fdb Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <[email protected]>
Date: Thu, 30 Oct 2025 14:57:35 +0100
Subject: [PATCH] rm pos vector
---
tools/server/server.cpp | 17 +++++++++----
tools/server/utils.hpp | 56 ++++++++++++++++-------------------------
2 files changed, 34 insertions(+), 39 deletions(-)
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index cd12e8479..9531f8bb0 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3908,8 +3908,9 @@ struct server_context {
}
// truncate any tokens that are beyond n_past for this slot
- if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1)) {
- SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", slot.prompt.n_tokens());
+ llama_pos p0 = slot.prompt.tokens.get_next_pos();
+ if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
+ SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", p0);
llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
// there is no common part left
@@ -3918,7 +3919,7 @@ struct server_context {
slot.prompt.tokens.clear();
}
- SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), slot.prompt.n_tokens());
+ SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
// remove the non-common part from the cache
slot.prompt.tokens.keep_first(slot.prompt.n_tokens());
@@ -3927,7 +3928,13 @@ struct server_context {
if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
// process the image
size_t n_tokens_out = 0;
- int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.id, n_tokens_out);
+ int32_t res = input_tokens.process_chunk(
+ ctx,
+ mctx,
+ slot.prompt.n_tokens(),
+ slot.prompt.tokens.get_next_pos(),
+ slot.id,
+ n_tokens_out);
if (res != 0) {
SLT_ERR(slot, "failed to process image, res = %d\n", res);
send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
@@ -3994,7 +4001,7 @@ struct server_context {
// embedding requires all tokens in the batch to be output
common_batch_add(batch,
cur_tok,
- input_tokens.get_pos(slot.prompt.n_tokens()),
+ slot.prompt.tokens.get_next_pos(),
{ slot.id },
slot.need_embd());
slot.prompt.tokens.push_back(cur_tok);
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 75c26f8a3..607be7f3e 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -1088,12 +1088,9 @@ private: // disallow accessing these members directly, risking out-of-sync
// if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by media chunk
// otherwise, it is a normal text token
// note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
+ // note(2): for M-RoPE, an image can occupy a different number of pos; do not assume a 1-to-1 mapping tokens <-> pos
llama_tokens tokens;
- // the position per-token (llama_pos) in the overall input
- // useful for M-RoPE, where the position is different from the index in tokens
- std::vector<llama_pos> pos;
-
// for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
// idx 0 1 2 3 4 5 6 7 8 9 10
@@ -1123,29 +1120,28 @@ public:
}
}
- server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {
- for (llama_pos i = 0; i < (llama_pos)tokens.size(); ++i) {
- pos.push_back(i);
- }
- }
+ server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
- llama_pos next_pos() const {
+ // get next position in the sequence
+ // see: https://github.com/ggml-org/llama.cpp/pull/16818#discussion_r2477778621
+ llama_pos get_next_pos() const {
if (tokens.empty()) {
return 0;
- } else if (tokens.back() != LLAMA_TOKEN_NULL) {
- return pos.back() + 1;
- } else {
- // find the last media chunk
- GGML_ASSERT(has_mtmd);
- GGML_ASSERT(!map_idx_to_media.empty());
- const auto & chunk = map_idx_to_media.rbegin()->second;
- return pos.back() + mtmd_input_chunk_get_n_pos(chunk.get());
}
- }
- llama_pos get_pos(size_t idx) const {
- GGML_ASSERT(idx < pos.size());
- return pos[idx];
+ llama_pos pos = tokens.size();
+ if (!has_mtmd || map_idx_to_media.empty()) {
+ return pos;
+ }
+
+ for (const auto & it : map_idx_to_media) {
+ const auto & chunk = it.second;
+ // subtract media tokens
+ pos -= mtmd_input_chunk_get_n_tokens(chunk.get());
+ // add media positions
+ pos += mtmd_input_chunk_get_n_pos(chunk.get());
+ }
+ return pos;
}
// for debugging
@@ -1154,12 +1150,11 @@ public:
oss << "tokens: ";
for (size_t idx = 0; idx < tokens.size(); ++idx) {
llama_token t = tokens[idx];
- llama_pos p = pos[idx];
oss << "idx:" << idx << " ";
if (t == LLAMA_TOKEN_NULL) {
- oss << "<embd>(" << p << ")\n";
+ oss << "<embd>\n";
} else {
- oss << t << "(" << p << ")\n";
+ oss << t << "\n";
}
}
oss << "\n";
@@ -1182,7 +1177,6 @@ public:
if (tok == LLAMA_TOKEN_NULL) {
throw std::runtime_error("Invalid token");
}
- pos.emplace_back(next_pos());
tokens.emplace_back(tok);
}
@@ -1192,10 +1186,8 @@ public:
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
GGML_ASSERT(has_mtmd);
const size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
- const llama_pos cur_pos = next_pos();
size_t start_idx = tokens.size();
for (size_t i = 0; i < n_tokens; ++i) {
- pos.emplace_back(cur_pos);
tokens.emplace_back(LLAMA_TOKEN_NULL);
}
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
@@ -1233,11 +1225,6 @@ public:
void insert(const llama_tokens & inp_tokens) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
- // rebuild the pos vector
- pos.clear();
- for (llama_pos i = 0; i < (llama_pos)tokens.size(); ++i) {
- pos.emplace_back(i);
- }
}
// for compatibility with speculative decoding, ctx shift, slot save/load
@@ -1386,6 +1373,7 @@ public:
llama_context * ctx,
mtmd_context * mctx,
size_t idx,
+ llama_pos pos,
int32_t seq_id,
size_t & n_tokens_out) const {
const auto & chunk = find_chunk(idx);
@@ -1397,7 +1385,7 @@ public:
llama_pos new_n_past; // unused for now
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
chunk.get(),
- pos[idx], // position
+ pos,
seq_id,
n_batch,
true, // logits last
--
2.39.5 (Apple Git-154)
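
For reference, the core idea of the patch is that the per-token pos vector can be dropped because the next position is recoverable from the token count and the stored media chunks alone: start from tokens.size(), then for each media chunk subtract its token footprint and add its position footprint. The following is a minimal standalone sketch of that arithmetic, not the actual server code: media_chunk, tokens_with_media, push_text and push_media are hypothetical stand-ins for the real mtmd_input_chunk / server_tokens machinery and its mtmd_input_chunk_get_n_tokens / mtmd_input_chunk_get_n_pos accessors.

// media_pos_demo.cpp -- sketch of the get_next_pos() position arithmetic (C++11)
#include <cassert>
#include <cstdio>
#include <map>
#include <vector>

using llama_pos_t = int;              // stand-in for llama_pos
using llama_tok_t = int;              // stand-in for llama_token
constexpr llama_tok_t TOKEN_NULL = -1; // stand-in for LLAMA_TOKEN_NULL

// Stand-in for a media chunk: how many token slots it occupies in the token
// list vs. how many positions it advances the sequence (M-RoPE).
struct media_chunk {
    int n_tokens; // memory cells used in the token list
    int n_pos;    // positions consumed in the sequence
};

struct tokens_with_media {
    std::vector<llama_tok_t>      tokens;       // TOKEN_NULL marks media slots
    std::map<size_t, media_chunk> idx_to_media; // start index -> media chunk

    void push_text(llama_tok_t t) {
        tokens.push_back(t);
    }

    void push_media(const media_chunk & c) {
        idx_to_media[tokens.size()] = c;
        for (int i = 0; i < c.n_tokens; ++i) {
            tokens.push_back(TOKEN_NULL);
        }
    }

    // Same arithmetic as get_next_pos() in the patch: start from the token
    // count, then for every media chunk replace its token footprint with its
    // position footprint.
    llama_pos_t get_next_pos() const {
        if (tokens.empty()) {
            return 0;
        }
        llama_pos_t pos = (llama_pos_t) tokens.size();
        for (const auto & it : idx_to_media) {
            pos -= it.second.n_tokens; // subtract media tokens
            pos += it.second.n_pos;    // add media positions
        }
        return pos;
    }
};

int main() {
    // Reproduce the example from the utils.hpp comment:
    // 5 text tokens followed by 2 images, each image = 3 tokens but only 2 pos.
    tokens_with_media seq;
    for (int t = 0; t < 5; ++t) {
        seq.push_text(100 + t);
    }
    seq.push_media({/*n_tokens=*/3, /*n_pos=*/2});
    seq.push_media({/*n_tokens=*/3, /*n_pos=*/2});

    // 11 token slots, but the next position is 11 - 2*3 + 2*2 = 9.
    assert(seq.tokens.size() == 11);
    assert(seq.get_next_pos() == 9);

    // Without media, positions match indices 1-to-1.
    tokens_with_media text_only;
    for (int t = 0; t < 5; ++t) {
        text_only.push_text(t);
    }
    assert(text_only.get_next_pos() == 5);

    std::printf("next pos with media: %d\n", seq.get_next_pos());
    std::printf("next pos text-only:  %d\n", text_only.get_next_pos());
    return 0;
}

Built with any C++11 compiler, both asserts pass: the media sequence of 11 token slots yields a next position of 9 (11 - 2*3 + 2*2), matching the example in the utils.hpp comment, while the text-only sequence keeps the 1-to-1 index/position mapping. This is the value the patched server passes both to llama_memory_seq_rm as the truncation point and to process_chunk as the chunk's starting position.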