From 3c55062c1f816a2e93b421c65ae9b521f81b5fdb Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <[email protected]>
Date: Thu, 30 Oct 2025 14:57:35 +0100
Subject: [PATCH] rm pos vector
---
tools/server/server.cpp | 17 +++++++++----
tools/server/utils.hpp | 56 ++++++++++++++++-------------------------
2 files changed, 34 insertions(+), 39 deletions(-)
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index cd12e8479..9531f8bb0 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3908,8 +3908,9 @@ struct server_context {
}
// truncate any tokens that are beyond n_past for this slot
- if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1)) {
- SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", slot.prompt.n_tokens());
+ llama_pos p0 = slot.prompt.tokens.get_next_pos();
+ if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
+ SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", p0);
llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
// there is no common part left
@@ -3918,7 +3919,7 @@ struct server_context {
slot.prompt.tokens.clear();
}
- SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), slot.prompt.n_tokens());
+ SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
// remove the non-common part from the cache
slot.prompt.tokens.keep_first(slot.prompt.n_tokens());
@@ -3927,7 +3928,13 @@ struct server_context {
if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
// process the image
size_t n_tokens_out = 0;
- int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.id, n_tokens_out);
+ int32_t res = input_tokens.process_chunk(
+ ctx,
+ mctx,
+ slot.prompt.n_tokens(),
+ slot.prompt.tokens.get_next_pos(),
+ slot.id,
+ n_tokens_out);
if (res != 0) {
SLT_ERR(slot, "failed to process image, res = %d\n", res);
send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
@@ -3994,7 +4001,7 @@ struct server_context {
// embedding requires all tokens in the batch to be output
common_batch_add(batch,
cur_tok,
- input_tokens.get_pos(slot.prompt.n_tokens()),
+ slot.prompt.tokens.get_next_pos(),
{ slot.id },
slot.need_embd());
slot.prompt.tokens.push_back(cur_tok);
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 75c26f8a3..607be7f3e 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -1088,12 +1088,9 @@ private: // disallow accessing these members directly, risking out-of-sync
// if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by media chunk
// otherwise, it is a normal text token
// note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
+ // note(2): for M-RoPE, an image can occupy a different number of pos; do not assume a 1-to-1 mapping tokens <-> pos
llama_tokens tokens;
- // the position per-token (llama_pos) in the overall input
- // useful for M-RoPE, where the position is different from the index in tokens
- std::vector<llama_pos> pos;
-
// for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
// idx 0 1 2 3 4 5 6 7 8 9 10
@@ -1123,29 +1120,28 @@ public:
}
}
- server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {
- for (llama_pos i = 0; i < (llama_pos)tokens.size(); ++i) {
- pos.push_back(i);
- }
- }
+ server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
- llama_pos next_pos() const {
+ // get next position in the sequence
+ // see: https://github.com/ggml-org/llama.cpp/pull/16818#discussion_r2477778621
+ llama_pos get_next_pos() const {
if (tokens.empty()) {
return 0;
- } else if (tokens.back() != LLAMA_TOKEN_NULL) {
- return pos.back() + 1;
- } else {
- // find the last media chunk
- GGML_ASSERT(has_mtmd);
- GGML_ASSERT(!map_idx_to_media.empty());
- const auto & chunk = map_idx_to_media.rbegin()->second;
- return pos.back() + mtmd_input_chunk_get_n_pos(chunk.get());
}
- }
- llama_pos get_pos(size_t idx) const {
- GGML_ASSERT(idx < pos.size());
- return pos[idx];
+ llama_pos pos = tokens.size();
+ if (!has_mtmd || map_idx_to_media.empty()) {
+ return pos;
+ }
+
+ for (const auto & it : map_idx_to_media) {
+ const auto & chunk = it.second;
+ // subtract media tokens
+ pos -= mtmd_input_chunk_get_n_tokens(chunk.get());
+ // add media positions
+ pos += mtmd_input_chunk_get_n_pos(chunk.get());
+ }
+ return pos;
}
// for debugging
@@ -1154,12 +1150,11 @@ public:
oss << "tokens: ";
for (size_t idx = 0; idx < tokens.size(); ++idx) {
llama_token t = tokens[idx];
- llama_pos p = pos[idx];
oss << "idx:" << idx << " ";
if (t == LLAMA_TOKEN_NULL) {
- oss << "<embd>(" << p << ")\n";
+ oss << "<embd>\n";
} else {
- oss << t << "(" << p << ")\n";
+ oss << t << "\n";
}
}
oss << "\n";
@@ -1182,7 +1177,6 @@ public:
if (tok == LLAMA_TOKEN_NULL) {
throw std::runtime_error("Invalid token");
}
- pos.emplace_back(next_pos());
tokens.emplace_back(tok);
}
@@ -1192,10 +1186,8 @@ public:
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
GGML_ASSERT(has_mtmd);
const size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
- const llama_pos cur_pos = next_pos();
size_t start_idx = tokens.size();
for (size_t i = 0; i < n_tokens; ++i) {
- pos.emplace_back(cur_pos);
tokens.emplace_back(LLAMA_TOKEN_NULL);
}
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
@@ -1233,11 +1225,6 @@ public:
void insert(const llama_tokens & inp_tokens) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
- // rebuild the pos vector
- pos.clear();
- for (llama_pos i = 0; i < (llama_pos)tokens.size(); ++i) {
- pos.emplace_back(i);
- }
}
// for compatibility with speculative decoding, ctx shift, slot save/load
@@ -1386,6 +1373,7 @@ public:
llama_context * ctx,
mtmd_context * mctx,
size_t idx,
+ llama_pos pos,
int32_t seq_id,
size_t & n_tokens_out) const {
const auto & chunk = find_chunk(idx);
@@ -1397,7 +1385,7 @@ public:
llama_pos new_n_past; // unused for now
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
chunk.get(),
- pos[idx], // position
+ pos,
seq_id,
n_batch,
true, // logits last
--
2.39.5 (Apple Git-154)
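
For reference, the core idea of the patch is that the per-token pos vector can be dropped because the next position is recoverable from the token count and the stored media chunks alone: start from tokens.size(), then for each media chunk subtract its token footprint and add its position footprint. The following is a minimal standalone sketch of that arithmetic, not the actual server code: media_chunk, tokens_with_media, push_text and push_media are hypothetical stand-ins for the real mtmd_input_chunk / server_tokens machinery and its mtmd_input_chunk_get_n_tokens / mtmd_input_chunk_get_n_pos accessors.

// media_pos_demo.cpp -- sketch of the get_next_pos() position arithmetic (C++11)
#include <cassert>
#include <cstdio>
#include <map>
#include <vector>

using llama_pos_t = int;              // stand-in for llama_pos
using llama_tok_t = int;              // stand-in for llama_token
constexpr llama_tok_t TOKEN_NULL = -1; // stand-in for LLAMA_TOKEN_NULL

// Stand-in for a media chunk: how many token slots it occupies in the token
// list vs. how many positions it advances the sequence (M-RoPE).
struct media_chunk {
    int n_tokens; // memory cells used in the token list
    int n_pos;    // positions consumed in the sequence
};

struct tokens_with_media {
    std::vector<llama_tok_t>      tokens;       // TOKEN_NULL marks media slots
    std::map<size_t, media_chunk> idx_to_media; // start index -> media chunk

    void push_text(llama_tok_t t) {
        tokens.push_back(t);
    }

    void push_media(const media_chunk & c) {
        idx_to_media[tokens.size()] = c;
        for (int i = 0; i < c.n_tokens; ++i) {
            tokens.push_back(TOKEN_NULL);
        }
    }

    // Same arithmetic as get_next_pos() in the patch: start from the token
    // count, then for every media chunk replace its token footprint with its
    // position footprint.
    llama_pos_t get_next_pos() const {
        if (tokens.empty()) {
            return 0;
        }
        llama_pos_t pos = (llama_pos_t) tokens.size();
        for (const auto & it : idx_to_media) {
            pos -= it.second.n_tokens; // subtract media tokens
            pos += it.second.n_pos;    // add media positions
        }
        return pos;
    }
};

int main() {
    // Reproduce the example from the utils.hpp comment:
    // 5 text tokens followed by 2 images, each image = 3 tokens but only 2 pos.
    tokens_with_media seq;
    for (int t = 0; t < 5; ++t) {
        seq.push_text(100 + t);
    }
    seq.push_media({/*n_tokens=*/3, /*n_pos=*/2});
    seq.push_media({/*n_tokens=*/3, /*n_pos=*/2});

    // 11 token slots, but the next position is 11 - 2*3 + 2*2 = 9.
    assert(seq.tokens.size() == 11);
    assert(seq.get_next_pos() == 9);

    // Without media, positions match indices 1-to-1.
    tokens_with_media text_only;
    for (int t = 0; t < 5; ++t) {
        text_only.push_text(t);
    }
    assert(text_only.get_next_pos() == 5);

    std::printf("next pos with media: %d\n", seq.get_next_pos());
    std::printf("next pos text-only:  %d\n", text_only.get_next_pos());
    return 0;
}

Built with any C++11 compiler, both asserts pass: the media sequence of 11 token slots yields a next position of 9 (11 - 2*3 + 2*2), matching the example in the utils.hpp comment, while the text-only sequence keeps the 1-to-1 index/position mapping. This is the value the patched server passes both to llama_memory_seq_rm as the truncation point and to process_chunk as the chunk's starting position.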