@@ -16080,19 +16080,21 @@ static int llama_decode_internal(
16080
16080
return -1;
16081
16081
}
16082
16082
16083
- for (uint32_t i = 0; i < n_tokens_all; ++i) {
16084
- if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
16085
- LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
16086
- return -1;
16087
- }
16088
- }
16089
-
16090
16083
const auto & model = lctx.model;
16091
16084
const auto & hparams = model.hparams;
16092
16085
const auto & cparams = lctx.cparams;
16093
16086
16094
16087
GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
16095
16088
16089
+ if (batch_all.token) {
16090
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
16091
+ if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
16092
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
16093
+ return -1;
16094
+ }
16095
+ }
16096
+ }
16097
+
16096
16098
GGML_ASSERT(n_tokens_all <= cparams.n_batch);
16097
16099
16098
16100
GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@@ -16379,19 +16381,21 @@ static int llama_encode_internal(
16379
16381
return -1;
16380
16382
}
16381
16383
16382
- for (uint32_t i = 0; i < n_tokens; ++i) {
16383
- if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
16384
- LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
16385
- return -1;
16386
- }
16387
- }
16388
-
16389
16384
const auto & model = lctx.model;
16390
16385
const auto & hparams = model.hparams;
16391
16386
const auto & cparams = lctx.cparams;
16392
16387
16393
16388
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
16394
16389
16390
+ if (batch.token) {
16391
+ for (uint32_t i = 0; i < n_tokens; ++i) {
16392
+ if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
16393
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
16394
+ return -1;
16395
+ }
16396
+ }
16397
+ }
16398
+
16395
16399
// micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
16396
16400
GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
16397
16401
0 commit comments