22#include <speculative.h>
24#include <mtmd-helper.h>
/**
 * @brief Test whether @p text ends with @p suffix.
 *
 * An empty suffix matches every string; a suffix longer than the text
 * never matches.
 *
 * @param text   String to inspect.
 * @param suffix Candidate trailing substring.
 * @return true when the last suffix.size() characters of text equal suffix.
 */
bool ends_with(const std::string& text, const std::string& suffix) {
    // Length guard comes first: without it the unsigned subtraction below
    // would wrap around when the suffix is longer than the text.
    return text.size() >= suffix.size()
        && text.compare(text.size() - suffix.size(), suffix.size(), suffix) == 0;
}
57bool check_stop_sequences(
58 const std::string& text,
59 const std::vector<std::string>& stop_sequences)
61 for (
const auto& stop : stop_sequences) {
62 if (!stop.empty() && ends_with(text, stop)) {
75GenerationResult prefill_error() {
78 r.error_message =
"Prefill decode failed";
79 r.finish_reason =
"error";
89void log_sampler_config(
const GenerationParams& params) {
90 logger->info(
"Sampler: temp={:.2f}, top_k={}, top_p={:.2f}, "
91 "repeat_penalty={:.2f}, thinking={}",
92 params.temperature, params.top_k, params.top_p,
93 params.repeat_penalty, params.enable_thinking);
103void finalize_result(GenerationResult& result,
104 std::chrono::steady_clock::time_point start_time)
109 if (result.token_count > 0 && result.generation_time_ms > 0.0) {
110 result.throughput_tok_s =
111 static_cast<double>(result.token_count)
112 / result.generation_time_ms * 1000.0;
114 logger->info(
"Generated: {} tokens, finish={}, {:.0f}ms, "
116 result.token_count, result.finish_reason,
117 result.generation_time_ms, result.throughput_tok_s);
118 logger->info(
"Content:\n{}", result.content);
130ggml_type parse_kv_cache_type(
const std::string& s) {
131 static const std::pair<const char*, ggml_type> kTable[] = {
132 {
"f16", GGML_TYPE_F16},
133 {
"f32", GGML_TYPE_F32},
134 {
"bf16", GGML_TYPE_BF16},
135 {
"q8_0", GGML_TYPE_Q8_0},
136 {
"q4_0", GGML_TYPE_Q4_0},
138 for (
const auto& [name, type] : kTable) {
139 if (s == name) {
return type; }
141 logger->warn(
"Unknown cache_type '{}' — defaulting to f16", s);
142 return GGML_TYPE_F16;
161 llama_model_params mparams = llama_model_default_params();
162 mparams.n_gpu_layers = 0;
163 mparams.use_mmap =
true;
174 logger->info(
"Model loaded (CPU): {} tokens in vocab, recurrent={}",
203 llama_context_params c = llama_context_default_params();
205 c.n_batch =
static_cast<uint32_t
>(cfg.
n_batch);
208 : std::thread::hardware_concurrency();
210 ? LLAMA_FLASH_ATTN_TYPE_ENABLED
211 : LLAMA_FLASH_ATTN_TYPE_DISABLED;
244 llama_model_params mparams = llama_model_default_params();
246 mparams.use_mmap =
true;
249 if (!
config().tensor_split.empty()) {
251 logger->warn(
"tensor_split not yet implemented, ignoring");
254 llama_model* new_model = llama_model_load_from_file(
255 config().path.c_str(), mparams);
262 last_error_ =
"Failed to reload model with GPU layers "
264 +
", gpu_layers=" + std::to_string(
config().gpu_layers)
265 +
") — check llama_ggml.log in the engine's log_dir "
266 "for the underlying llama.cpp/CUDA error";
283 llama_context_params cparams = build_cparams(
config());
285 ctx_ = llama_init_from_model(
model_, cparams);
291 logger->info(
"Context created: n_ctx={}, n_batch={}, "
292 "flash_attn={}, type_k={}, type_v={}",
301 logger->info(
"Prompt cache initialized: max_bytes={}",
320 if (
config().mmproj_path.empty()) {
324 auto ctx_params = mtmd_context_params_default();
327 ? LLAMA_FLASH_ATTN_TYPE_ENABLED
328 : LLAMA_FLASH_ATTN_TYPE_DISABLED;
329 ctx_params.print_timings =
false;
333 logger->error(
"mtmd_init_from_file failed for {} — "
334 "continuing in text-only mode",
335 config().mmproj_path.string());
340 logger->info(
"mmproj loaded from {} — vision={}",
363 llama_model_params mparams = llama_model_default_params();
364 mparams.n_gpu_layers = 0;
365 mparams.use_mmap =
true;
368 llama_model* cpu_model = llama_model_load_from_file(
369 config().path.c_str(), mparams);
375 logger->warn(
"Failed to reload CPU model during deactivate, keeping GPU model");
431 const std::string& text,
bool add_special)
const
434 int n = llama_tokenize(
vocab_, text.c_str(),
435 static_cast<int32_t
>(text.size()),
436 nullptr, 0, add_special,
true);
441 std::vector<llama_token> tokens(
static_cast<size_t>(n));
442 int actual = llama_tokenize(
vocab_, text.c_str(),
443 static_cast<int32_t
>(text.size()),
444 tokens.data(), n, add_special,
true);
446 logger->error(
"Tokenization failed for text of length {}", text.size());
449 tokens.resize(
static_cast<size_t>(actual));
483 int n = llama_token_to_piece(
vocab_, token, buf,
sizeof(buf), 0,
false);
486 std::vector<char> large(
static_cast<size_t>(-n));
487 n = llama_token_to_piece(
vocab_, token, large.data(),
488 static_cast<int32_t
>(large.size()), 0,
false);
490 return std::string(large.data(),
static_cast<size_t>(n));
494 return std::string(buf,
static_cast<size_t>(n));
505 auto tokens =
tokenize(text,
false);
506 return static_cast<int>(tokens.size());
517 const std::string& text)
const {
519 return {tokens.begin(), tokens.end()};
539 const int32_t* tokens,
542 int n_vocab = llama_vocab_n_tokens(
vocab_);
544 result.
tokens.assign(tokens, tokens + n_tokens);
549 auto* mem = llama_get_memory(
ctx_);
550 llama_memory_clear(mem,
true);
552 for (
int i = 0; i < n_tokens; i++) {
553 llama_token tok = tokens[i];
554 llama_batch batch = llama_batch_get_one(&tok, 1);
555 int rc = llama_decode(
ctx_, batch);
557 llama_memory_clear(mem,
true);
558 throw std::runtime_error(
"llama_decode failed at logprob pos");
560 if (i < n_tokens - 1) {
561 const float* logits = llama_get_logits_ith(
ctx_, -1);
563 logits, tokens[i + 1], n_vocab);
569 for (
float lp : result.
logprobs) { sum += lp; }
572 -sum /
static_cast<float>(result.
n_logprobs));
574 llama_memory_clear(mem,
true);
623 float max_logit = logits[0];
624 for (
int v = 1; v < n_vocab; v++) {
625 if (logits[v] > max_logit) {
626 max_logit = logits[v];
629 float sum_exp = 0.0f;
630 for (
int v = 0; v < n_vocab; v++) {
631 sum_exp += std::exp(logits[v] - max_logit);
633 float log_sum_exp = max_logit + std::log(sum_exp);
634 return logits[next_token] - log_sum_exp;
660 const std::vector<Message>& messages) {
661 std::vector<llama_chat_message> chat_msgs;
662 chat_msgs.reserve(messages.size());
663 for (
const auto& msg : messages) {
664 chat_msgs.push_back({msg.role.c_str(), msg.content.c_str()});
677 const std::vector<Message>& messages) {
678 std::string fallback;
679 for (
const auto& msg : messages) {
680 fallback += msg.role +
": " + msg.content +
"\n";
691 const std::vector<Message>& messages,
697 int n = llama_chat_apply_template(
698 nullptr, chat_msgs.data(), chat_msgs.size(),
701 logger->error(
"llama_chat_apply_template failed (size query)");
705 std::vector<char> buf(
static_cast<size_t>(n + 1));
706 int written = llama_chat_apply_template(
707 nullptr, chat_msgs.data(), chat_msgs.size(),
708 true, buf.data(),
static_cast<int32_t
>(buf.size()));
710 logger->error(
"llama_chat_apply_template failed (render)");
714 return std::string(buf.data(),
static_cast<size_t>(written));
735 llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
736 llama_sampler* chain = llama_sampler_chain_init(chain_params);
740 llama_sampler* grammar = llama_sampler_init_grammar(
743 llama_sampler_chain_add(chain, grammar);
749 llama_sampler_chain_add(chain,
750 llama_sampler_init_penalties(
756 llama_sampler_chain_add(chain,
761 if (params.
top_k > 0) {
762 llama_sampler_chain_add(chain,
763 llama_sampler_init_top_k(params.
top_k));
767 if (params.
top_p < 1.0f) {
768 llama_sampler_chain_add(chain,
769 llama_sampler_init_top_p(params.
top_p, 1));
773 uint32_t seed = params.
seed < 0
775 :
static_cast<uint32_t
>(params.
seed);
776 llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));
791 llama_memory_clear(llama_get_memory(
ctx_),
true);
794 const int n_tokens =
static_cast<int>(tokens.size());
796 for (
int i = 0; i < n_tokens; i += n_batch) {
797 int chunk = std::min(n_batch, n_tokens - i);
798 std::vector<llama_token> slice(
799 tokens.begin() + i, tokens.begin() + i + chunk);
800 llama_batch batch = llama_batch_get_one(
801 slice.data(),
static_cast<int32_t
>(chunk));
802 if (llama_decode(
ctx_, batch) != 0) {
803 logger->error(
"Prefill decode failed at offset {}", i);
821 llama_sampler* sampler,
822 std::string& generated,
823 std::function<
void(std::string_view)>& on_token,
824 const std::vector<std::string>& stop)
826 llama_token new_token = llama_sampler_sample(sampler,
ctx_, -1);
828 if (new_token == llama_vocab_eos(
vocab_)
829 || llama_vocab_is_eog(
vocab_, new_token)) {
836 on_token(std::string_view(piece));
838 if (check_stop_sequences(generated, stop)) {
842 llama_token tok = new_token;
843 llama_batch single = llama_batch_get_one(&tok, 1);
844 return (llama_decode(
ctx_, single) == 0) ?
"continue" :
"error";
858 const std::vector<llama_token>& tokens,
860 std::function<
void(std::string_view)> on_token,
861 std::atomic<bool>* cancel)
870 llama_sampler_free(sampler);
874 std::string generated;
878 bool cancelled = cancel && cancel->load(std::memory_order_acquire);
885 auto status =
step_token(sampler, generated, on_token, params.
stop);
886 if (status ==
"continue") {
889 result.
finish_reason = (status ==
"error") ?
"error" :
"stop";
890 if (status ==
"error") {
901 llama_sampler_free(sampler);
917 const std::vector<Message>& messages)
919 for (
const auto& msg : messages) {
920 if (msg.role ==
"system") {
940 const std::vector<llama_token>& tokens,
int start_offset)
942 int total =
static_cast<int>(tokens.size());
943 if (start_offset >= total) {
return true; }
945 int n_batch = llama_n_batch(
ctx_);
946 int n_remaining = total - start_offset;
947 for (
int off = 0; off < n_remaining; off += n_batch) {
948 int chunk = std::min(n_batch, n_remaining - off);
949 llama_batch batch = llama_batch_get_one(
950 const_cast<llama_token*
>(tokens.data())
951 + start_offset + off,
953 if (llama_decode(
ctx_, batch) != 0) {
954 logger->error(
"Decode chunk failed (start={}, off={}, "
955 "chunk={})", start_offset, off, chunk);
980 const std::vector<llama_token>& tokens)
982 auto* mem = llama_get_memory(
ctx_);
983 llama_memory_clear(mem,
true);
985 size_t restored = llama_state_seq_set_data(
988 logger->warn(
"KV state restore failed, falling back to full prefill");
1008 const CacheKey& key,
int prefix_tokens)
1010 size_t state_size = llama_state_seq_get_size(
ctx_, 0);
1011 if (state_size == 0) {
1015 std::vector<uint8_t> buf(state_size);
1016 size_t written = llama_state_seq_get_data(
1017 ctx_, buf.data(), buf.size(), 0);
1019 buf.resize(written);
1033 const std::vector<Message>& messages,
1036 std::vector<Message> sys_msgs;
1037 for (
const auto& msg : messages) {
1038 if (msg.role ==
"system") {
1039 sys_msgs.push_back(msg);
1042 if (sys_msgs.empty()) {
1047 auto sys_tokens =
tokenize(sys_prompt,
true);
1048 return static_cast<int>(sys_tokens.size());
1073 const std::vector<llama_token>& tokens,
1077 int total =
static_cast<int>(tokens.size());
1078 if (prefix_tokens <= 0 || prefix_tokens >= total) {
1085 std::vector<llama_token> prefix(
1086 tokens.begin(), tokens.begin() + prefix_tokens);
1115 const std::vector<llama_token>& tokens,
1116 const std::string& system_prompt,
1117 const std::vector<Message>& messages,
1122 && !system_prompt.empty();
1124 if (!cache_enabled) {
1129 system_prompt,
config().path.string());
1132 if (cached !=
nullptr) {
1134 logger->info(
"Prompt cache HIT: {} bytes, {} prefix tokens",
1140 logger->warn(
"Cache restore failed, falling back to full prefill");
1142 logger->info(
"Prompt cache MISS: processing full prompt");
1158bool any_image_in(
const std::vector<Message>& messages) {
1159 for (
const auto& m : messages) {
1160 if (
has_images(m.content_parts)) {
return true; }
1177std::vector<Message> strip_image_parts(
1178 const std::vector<Message>& messages) {
1179 std::vector<Message> out = messages;
1180 for (
auto& m : out) {
1181 if (m.content_parts.empty()) {
continue; }
1183 m.content_parts.clear();
1205std::vector<Message> substitute_image_markers(
1206 const std::vector<Message>& messages,
1207 ::mtmd_context* ctx,
1208 std::vector<::mtmd_bitmap*>& bitmaps_out) {
1209 std::vector<Message> out;
1210 out.reserve(messages.size());
1211 const std::string marker = mtmd_default_marker();
1212 for (
const auto& m : messages) {
1215 if (m.content_parts.empty()) {
1216 copy.content = m.content;
1217 out.push_back(std::move(copy));
1221 for (
const auto& p : m.content_parts) {
1226 ::mtmd_bitmap* bm =
nullptr;
1227 if (!p.image_path.empty()) {
1228 bm = mtmd_helper_bitmap_init_from_file(
1229 ctx, p.image_path.c_str());
1231 if (bm ==
nullptr) {
return {}; }
1232 bitmaps_out.push_back(bm);
1235 copy.content = std::move(built);
1236 out.push_back(std::move(copy));
1254 const std::string& prompt,
1255 const std::vector<::mtmd_bitmap*>& bitmaps,
1256 std::string& err_msg)
1258 llama_memory_clear(llama_get_memory(
ctx_),
true);
1259 ::mtmd_input_text mt{prompt.c_str(),
true,
true};
1260 auto* chunks = mtmd_input_chunks_init();
1261 std::vector<const ::mtmd_bitmap*> bm_cptrs(
1262 bitmaps.begin(), bitmaps.end());
1263 int32_t tok_rc = mtmd_tokenize(
1264 mtmd_ctx_, chunks, &mt, bm_cptrs.data(), bm_cptrs.size());
1266 mtmd_input_chunks_free(chunks);
1267 err_msg =
"mtmd_tokenize failed (rc="
1268 + std::to_string(tok_rc) +
")";
1271 llama_pos new_n_past = 0;
1272 int32_t eval_rc = mtmd_helper_eval_chunks(
1274 static_cast<int32_t
>(
config().n_batch),
1276 mtmd_input_chunks_free(chunks);
1278 err_msg =
"mtmd_helper_eval_chunks failed (rc="
1279 + std::to_string(eval_rc) +
")";
1282 logger->info(
"Multimodal prefill complete: n_past={}", new_n_past);
1298 std::function<
void(std::string_view token)> on_token,
1299 std::atomic<bool>* cancel,
1300 const std::chrono::steady_clock::time_point& t0)
1304 std::string generated;
1305 int n_generated = 0;
1307 if (cancel !=
nullptr
1308 && cancel->load(std::memory_order_acquire)) {
1314 sampler, generated, on_token, params.
stop);
1315 if (status ==
"continue") { ++n_generated;
continue; }
1316 result.
finish_reason = (status ==
"error") ?
"error" :
"stop";
1317 if (status ==
"error") {
1326 llama_sampler_free(sampler);
1329 finalize_result(result, t0);
1339 const std::vector<Message>& messages,
1341 std::function<
void(std::string_view token)> on_token,
1342 std::atomic<bool>* cancel)
1344 auto t0 = entropic::log::now();
1345 std::vector<::mtmd_bitmap*> bitmaps;
1346 auto marked = substitute_image_markers(
1348 if (marked.empty()) {
1349 for (
auto* b : bitmaps) { mtmd_bitmap_free(b); }
1353 "mtmd_helper_bitmap_init_from_file failed";
1357 logger->info(
"Multimodal generate: {} images, prompt={} chars, max_tokens={}",
1358 bitmaps.size(), prompt.size(), params.
max_tokens);
1359 std::string prefill_err;
1361 for (
auto* b : bitmaps) { mtmd_bitmap_free(b); }
1389 const std::vector<Message>& messages,
1392 if (!any_image_in(messages)) {
1398 logger->warn(
"Image content present but model has no vision "
1399 "capability — stripping image parts");
1409 const std::vector<Message>& messages,
1412 auto t0 = entropic::log::now();
1414 auto tokens =
tokenize(prompt,
true);
1417 logger->info(
"Generate: {} input tokens, max_tokens={}",
1419 log_sampler_config(params);
1425 llama_sampler_free(sampler);
1426 return prefill_error();
1429 std::string generated;
1430 int n_generated = 0;
1431 std::function<void(std::string_view)> no_cb =
nullptr;
1435 sampler, generated, no_cb, params.
stop);
1436 if (status ==
"continue") { ++n_generated; }
1439 (status ==
"error") ?
"error" :
"stop";
1440 if (status ==
"error") {
1452 llama_sampler_free(sampler);
1455 finalize_result(result, t0);
1470 const std::vector<Message>& messages,
1472 std::function<
void(std::string_view token)> on_token,
1473 std::atomic<bool>& cancel)
1475 if (!any_image_in(messages)) {
1477 messages, params, on_token, cancel);
1482 logger->warn(
"Image content present but model has no vision "
1483 "capability — stripping image parts");
1485 strip_image_parts(messages), params, on_token, cancel);
1494 const std::vector<Message>& messages,
1496 std::function<
void(std::string_view token)> on_token,
1497 std::atomic<bool>& cancel)
1499 auto t0 = entropic::log::now();
1501 auto tokens =
tokenize(prompt,
true);
1503 logger->info(
"Stream: {} input tokens, max_tokens={}",
1505 log_sampler_config(params);
1510 llama_sampler_free(sampler);
1511 return prefill_error();
1513 std::string generated;
1514 int n_generated = 0;
1516 if (cancel.load(std::memory_order_acquire)) {
1522 sampler, generated, on_token, params.
stop);
1523 if (status ==
"continue") { ++n_generated; }
1526 (status ==
"error") ?
"error" :
"stop";
1527 if (status ==
"error") {
1537 llama_sampler_free(sampler);
1540 finalize_result(result, t0);
1557 const std::vector<Message>& ,
1559 std::function<
void(std::string_view)> ,
1560 std::atomic<bool>& )
1565 "LlamaCppBackend speculative requires an explicit draft "
1566 "backend handle — orchestrator dispatches via "
1567 "generate_speculative_with_draft";
1586common_params_sampling to_common_sampling(
1588 common_params_sampling cps;
1590 cps.top_k = params.
top_k;
1591 cps.top_p = params.
top_p;
1593 if (params.
seed >= 0) {
1594 cps.seed =
static_cast<uint32_t
>(params.
seed);
1606 cps.samplers = {COMMON_SAMPLER_TYPE_PENALTIES,
1607 COMMON_SAMPLER_TYPE_TOP_K,
1608 COMMON_SAMPLER_TYPE_TOP_P};
1610 cps.samplers.push_back(COMMON_SAMPLER_TYPE_TEMPERATURE);
1613 cps.dry_multiplier = 0.0f;
1614 cps.top_n_sigma = -1.0f;
1634bool spec_prefill_minus_last(
1635 llama_context* ctx,
const std::vector<llama_token>& tokens) {
1636 int total =
static_cast<int>(tokens.size()) - 1;
1637 if (total <= 0) {
return true; }
1638 int n_batch = llama_n_batch(ctx);
1639 for (
int off = 0; off < total; off += n_batch) {
1640 int chunk = std::min(n_batch, total - off);
1641 llama_batch batch = llama_batch_get_one(
1642 const_cast<llama_token*
>(tokens.data()) + off, chunk);
1643 if (llama_decode(ctx, batch) != 0) {
return false; }
1655 r.error_code = code;
1656 r.error_message = std::move(msg);
1657 r.finish_reason =
"error";
1671 common_speculative* spec =
nullptr;
1672 common_sampler* smpl =
nullptr;
1673 llama_context* ctx_tgt =
nullptr;
1674 llama_context* ctx_dft =
nullptr;
1675 llama_batch batch_tgt{};
1676 bool batch_initialized =
false;
1677 llama_seq_id seq_id = 0;
1679 llama_token id_last = 0;
1680 std::vector<llama_token> prompt_tgt;
1681 std::vector<llama_token> draft;
1682 std::string generated;
1683 int n_generated = 0;
1686 bool has_eos =
false;
1687 std::string finish_reason;
1689 std::string error_message;
1698 bool use_ckpt_tgt =
false;
1699 bool use_ckpt_dft =
false;
1700 common_prompt_checkpoint ckpt;
1710 if (state.spec) { common_speculative_free(state.spec); }
1711 if (state.smpl) { common_sampler_free(state.smpl); }
1712 if (state.batch_initialized) {
1713 llama_batch_free(state.batch_tgt);
1724 common_batch_clear(state.batch_tgt);
1725 common_batch_add(state.batch_tgt, state.id_last,
1726 state.n_past, {state.seq_id},
true);
1727 int pos = state.n_past + 1;
1728 for (
auto draft_token : state.draft) {
1729 common_batch_add(state.batch_tgt, draft_token, pos,
1730 {state.seq_id},
true);
1744 int rc_tgt = llama_decode(state.ctx_tgt, state.batch_tgt);
1746 logger->error(
"Speculative target decode failed: rc={}, "
1747 "n_past={}, draft_size={}",
1748 rc_tgt, state.n_past, state.draft.size());
1750 state.error_message =
"target llama_decode failed";
1751 state.finish_reason =
"error";
1754 int rc_dft = llama_decode(state.ctx_dft, state.batch_tgt);
1756 logger->error(
"Speculative draft decode failed: rc={}, "
1757 "n_past={}, draft_size={}",
1758 rc_dft, state.n_past, state.draft.size());
1760 state.error_message =
"draft llama_decode failed";
1761 state.finish_reason =
"error";
1774 auto& dp = common_speculative_get_draft_params(
1775 state.spec, state.seq_id);
1778 dp.n_past = state.n_past;
1779 dp.id_last = state.id_last;
1780 dp.prompt = &state.prompt_tgt;
1781 dp.result = &state.draft;
1782 common_speculative_draft(state.spec);
1783 return static_cast<int>(state.draft.size());
1801 const llama_vocab* vocab,
int max_tokens,
1802 std::function<
void(std::string_view)>& on_token,
1803 std::atomic<bool>& cancel)
1806 state.prompt_tgt.push_back(state.id_last);
1808 state.n_generated++;
1809 if (llama_vocab_is_eog(vocab,
id)) {
1810 state.has_eos =
true;
1811 state.finish_reason =
"stop";
1814 const std::string piece =
1815 common_token_to_piece(state.ctx_tgt,
id);
1816 state.generated += piece;
1817 if (on_token) { on_token(piece); }
1818 if (cancel.load(std::memory_order_acquire)) {
1820 state.finish_reason =
"cancelled";
1822 }
else if (state.n_generated >= max_tokens) {
1823 state.finish_reason =
"length";
1842 state.ckpt.update_pos(
1843 static_cast<int64_t
>(state.prompt_tgt.size()),
1844 llama_memory_seq_pos_min(
1845 llama_get_memory(state.ctx_tgt), state.seq_id),
1846 llama_memory_seq_pos_max(
1847 llama_get_memory(state.ctx_tgt), state.seq_id));
1848 if (state.use_ckpt_dft) {
1849 state.ckpt.update_dft(state.ctx_dft, state.seq_id,
1850 LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY
1851 | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
1862 if (state.use_ckpt_tgt && !state.draft.empty()) {
1863 state.ckpt.update_tgt(state.ctx_tgt, state.seq_id,
1864 LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY
1865 | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
1876 constexpr auto flags = LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY
1877 | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE;
1878 if (state.use_ckpt_dft) {
1879 state.ckpt.load_dft(state.ctx_dft, state.seq_id, flags);
1881 llama_memory_seq_rm(llama_get_memory(state.ctx_dft),
1882 state.seq_id, state.ckpt.pos_max + 1, -1);
1896 std::vector<llama_token>& ids) {
1897 constexpr auto flags = LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY
1898 | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE;
1899 state.draft = std::move(ids);
1900 state.ckpt.load_tgt(state.ctx_tgt, state.seq_id, flags);
1901 llama_memory_seq_rm(llama_get_memory(state.ctx_tgt),
1902 state.seq_id, state.ckpt.pos_max + 1, -1);
1903 state.ckpt.load_dft(state.ctx_dft, state.seq_id, flags);
1904 llama_memory_seq_rm(llama_get_memory(state.ctx_dft),
1905 state.seq_id, state.ckpt.pos_max + 1, -1);
1906 state.prompt_tgt.resize(
static_cast<size_t>(state.ckpt.n_tokens));
1907 state.n_past =
static_cast<int>(state.prompt_tgt.size());
1909 common_sampler_free(state.smpl);
1910 state.smpl = smpl_save;
1938 llama_memory_seq_rm(llama_get_memory(state.ctx_tgt),
1939 state.seq_id, state.n_past, -1);
1940 llama_memory_seq_rm(llama_get_memory(state.ctx_dft),
1941 state.seq_id, state.n_past, -1);
1952 const std::vector<llama_token>& ids,
1953 const llama_vocab* vocab,
int max_tokens,
1954 std::function<
void(std::string_view)>& on_token,
1955 std::atomic<bool>& cancel) {
1957 for (
auto id : ids) {
1959 state,
id, vocab, max_tokens, on_token, cancel);
1960 if (!signal.empty()) { stop =
true;
break; }
1988 if (!state.draft.empty()) {
1989 return static_cast<int>(state.draft.size());
2005 const llama_vocab* vocab,
2007 std::function<
void(std::string_view)>& on_token,
2008 std::atomic<bool>& cancel)
2014 common_sampler* smpl_save =
nullptr;
2015 if (state.use_ckpt_tgt) {
2016 smpl_save = common_sampler_clone(state.smpl);
2018 auto ids = common_sampler_sample_and_accept_n(
2019 state.smpl, state.ctx_tgt, state.draft);
2020 int accepted =
static_cast<int>(ids.size()) - 1;
2021 if (accepted < 0) { accepted = 0; }
2025 if (state.use_ckpt_tgt
2026 &&
static_cast<int>(ids.size()) - 1
2027 <
static_cast<int>(state.draft.size())) {
2031 if (smpl_save) { common_sampler_free(smpl_save); }
2033 common_speculative_accept(state.spec, state.seq_id, accepted);
2034 state.n_drafted += draft_size_before;
2035 state.n_accepted += accepted;
2041 state.n_past +=
static_cast<int>(ids.size());
2044 state, ids, vocab, max_tokens, on_token, cancel);
2045 state.draft.clear();
2063 bool target_active,
bool draft_active,
2064 llama_context* ctx_tgt, llama_context* ctx_dft) {
2071 const llama_model* model_tgt = llama_get_model(ctx_tgt);
2072 int cap_tgt = common_context_can_seq_rm(ctx_tgt);
2073 int cap_dft = common_context_can_seq_rm(ctx_dft);
2074 logger->info(
"Speculative seq_rm capability: target={}, draft={} "
2075 "(0=NO, 1=PART, 2=FULL)", cap_tgt, cap_dft);
2076 if (!target_active || !draft_active) {
2077 err =
"speculative requires ACTIVE target + draft";
2078 }
else if (llama_model_is_recurrent(model_tgt)
2079 || llama_model_is_hybrid(model_tgt)) {
2080 err =
"speculative refused: architecture (target is "
2081 "recurrent or hybrid; see proposal Implementation "
2083 }
else if (cap_tgt == COMMON_CONTEXT_SEQ_RM_TYPE_NO
2084 || cap_dft == COMMON_CONTEXT_SEQ_RM_TYPE_NO) {
2087 err =
"speculative kernel requires at least FULL seq_rm "
2088 "(target/draft reported NO seq_rm at all)";
2124 const std::string& draft_path) {
2125 auto common_sampling = to_common_sampling(params);
2126 state.smpl = common_sampler_init(model_tgt, common_sampling);
2127 if (!state.smpl) {
return "common_sampler_init failed"; }
2129 common_params_speculative spec_params;
2130 spec_params.types = {COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE};
2131 spec_params.draft.n_max = (n_draft_max > 0) ? n_draft_max : 16;
2132 spec_params.draft.ctx_tgt = state.ctx_tgt;
2133 spec_params.draft.ctx_dft = state.ctx_dft;
2137 spec_params.draft.mparams.path = draft_path;
2138 state.spec = common_speculative_init(spec_params, 1);
2140 common_sampler_free(state.smpl);
2141 state.smpl =
nullptr;
2142 return "common_speculative_init failed";
2145 common_speculative_begin(state.spec, state.seq_id, state.prompt_tgt);
2146 state.batch_tgt = llama_batch_init(llama_n_batch(state.ctx_tgt), 0, 1);
2147 state.batch_initialized =
true;
2151 state.use_ckpt_tgt = common_context_can_seq_rm(state.ctx_tgt)
2152 == COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
2153 state.use_ckpt_dft = common_context_can_seq_rm(state.ctx_dft)
2154 == COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
2165 const std::vector<llama_token>& tokens,
2167 const std::string& draft_path) {
2168 state.id_last = tokens.back();
2169 state.prompt_tgt.assign(tokens.begin(), tokens.end() - 1);
2170 state.n_past =
static_cast<int>(tokens.size()) - 1;
2172 llama_memory_clear(llama_get_memory(state.ctx_tgt),
true);
2173 llama_memory_clear(llama_get_memory(state.ctx_dft),
true);
2175 if (!spec_prefill_minus_last(state.ctx_tgt, tokens)
2176 || !spec_prefill_minus_last(state.ctx_dft, tokens)) {
2177 return "speculative prefill failed";
2180 state, model_tgt, params, n_draft_max, draft_path);
2191 std::function<
void(std::string_view)>& on_token,
2192 std::atomic<bool>& cancel) {
2193 while (state.n_generated < max_tokens) {
2194 if (cancel.load(std::memory_order_acquire)) {
2196 state.finish_reason =
"cancelled";
2200 on_token, cancel)) {
2204 if (state.finish_reason.empty()) {
2205 state.finish_reason = (state.n_generated >= max_tokens)
2206 ?
"length" :
"stop";
2223 std::chrono::steady_clock::time_point t0) {
2225 result.
content = state.generated;
2231 entropic::log::elapsed_ms(t0, entropic::log::now());
2232 if (state.n_drafted > 0) {
2233 const float accept_rate =
2234 static_cast<float>(state.n_accepted)
2235 /
static_cast<float>(state.n_drafted);
2236 logger->info(
"Speculative: generated={}, drafted={}, "
2237 "accepted={}, accept_rate={:.3f}",
2238 state.n_generated, state.n_drafted,
2239 state.n_accepted, accept_rate);
2286 llama_context* ctx_tgt, llama_context* ctx_dft, llama_model* model_tgt,
2288 std::function<
void(std::string_view)>& on_token,
2289 std::atomic<bool>& cancel,
int n_draft_max,
2290 const std::string& draft_path,
2291 std::chrono::steady_clock::time_point t0) {
2293 state.ctx_tgt = ctx_tgt;
2294 state.ctx_dft = ctx_dft;
2295 auto init_err =
spec_init_run(state, model_tgt, tokens, params,
2296 n_draft_max, draft_path);
2297 if (!init_err.empty()) {
2300 std::move(init_err));
2313 const std::vector<Message>& messages,
2315 std::function<
void(std::string_view)> on_token,
2316 std::atomic<bool>& cancel,
2319 const std::string& draft_path)
2321 auto t0 = entropic::log::now();
2325 if (!pre_err.empty()) {
2327 (pre_err.find(
"requires ACTIVE") != std::string::npos)
2330 result = spec_error(code, std::move(pre_err));
2333 auto tokens =
tokenize(prompt,
true);
2334 if (tokens.size() < 2) {
2336 "speculative prompt must have at least 2 tokens");
2338 logger->info(
"Speculative: {} input tokens, max_tokens={}, "
2340 tokens.size(), params.
max_tokens, n_draft_max);
2343 cancel, n_draft_max, draft_path, t0);
2358 const std::string& prompt,
2361 auto t0 = entropic::log::now();
2362 auto tokens =
tokenize(prompt,
false);
2364 logger->info(
"Complete: {} input tokens, max_tokens={}",
2366 log_sampler_config(params);
2367 auto result =
decode_loop(tokens, params,
nullptr,
nullptr);
2368 finalize_result(result, t0);
2394 int idx =
static_cast<int>(cap);
2396 if (idx < 0 || idx >= count) {
2403 static constexpr bool always[] = {
2404 false,
false,
true,
true,
true,
true,
2405 false,
true,
true,
false,
false,
true,
2410 bool result = always[idx];
2443 bi.
name =
"llama.cpp";
2444#if defined(ENTROPIC_BACKEND_CUDA)
2446#elif defined(ENTROPIC_BACKEND_VULKAN)
2461 char desc[256] = {};
2462 llama_model_desc(
model_, desc,
sizeof(desc));
2476 if (
ctx_ ==
nullptr) {
2479 auto mem = llama_get_memory(
ctx_);
2481 llama_memory_clear(mem,
true);
2483 llama_memory_seq_rm(mem, seq_id, -1, -1);
std::string last_error_
Last error message for diagnostics.
bool is_active() const
True when state is ACTIVE.
ModelState state() const
Current lifecycle state (lock-free read).
const ModelConfig & config() const
Stored model config.
int context_length() const
Model's context window size.
LlamaCppBackend — common llama.cpp patterns (15% layer).
bool load_gpu_model()
Load the GGUF model onto the GPU (do_activate step 1).
bool do_load(const ModelConfig &config) override
Load model into CPU RAM (COLD → WARM).
bool do_supports(BackendCapability cap) const override
Declare llama.cpp backend capabilities.
GenerationResult decode_loop(const std::vector< llama_token > &tokens, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > *cancel)
Core decode loop — shared by generate and streaming.
bool is_recurrent_
True if loaded model is recurrent (GDN/Mamba/RWKV).
LogprobResult do_evaluate_logprobs(const int32_t *tokens, int n_tokens) override
Evaluate per-token log-probabilities via sequential decode.
std::string do_backend_name() const override
Return backend name.
std::unique_ptr< PromptCache > prompt_cache_
KV prefix cache (v1.8.3)
std::string step_token(llama_sampler *sampler, std::string &generated, std::function< void(std::string_view)> &on_token, const std::vector< std::string > &stop)
Generate one token and append to output.
GenerationResult do_generate(const std::vector< Message > &messages, const GenerationParams &params) override
Generate a complete response using chat template.
GenerationResult do_complete(const std::string &prompt, const GenerationParams &params) override
Raw text completion without chat template.
std::vector< llama_token > tokenize(const std::string &text, bool add_special) const
Tokenize text using model vocabulary.
bool create_inference_context()
Create the llama context + prompt cache (do_activate step 2).
const llama_vocab * vocab_
Vocabulary (from model_)
int compute_prefix_token_count(const std::vector< Message > &messages, const GenerationParams &params)
Compute token count of system messages only.
std::string detokenize(llama_token token) const
Detokenize a single token.
void init_mmproj_if_configured()
Initialize the libmtmd context if mmproj is configured.
GenerationResult generate_speculative_with_draft(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel, LlamaCppBackend &draft, int n_draft_max, const std::string &draft_path)
Speculative-decoding kernel with explicit draft backend.
llama_context * ctx_
Inference context (ACTIVE)
bool run_prefill(const std::vector< llama_token > &tokens)
Run batched prefill on input tokens.
GenerationResult run_sampling_loop(const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel, const std::chrono::steady_clock::time_point &t0)
Sample tokens until stop / max_tokens / cancel.
bool restore_cached_prefix(const CacheEntry *cached, const std::vector< llama_token > &tokens)
Restore KV state from cache and decode remaining tokens.
void save_prefix_to_cache(const CacheKey &key, int prefix_tokens)
Capture seq 0 KV state and store under the given key.
std::vector< int32_t > tokenize_text(const std::string &text) const override
Tokenize text to token IDs using model vocabulary.
bool is_recurrent() const
Check if loaded model is recurrent.
entropic_error_t mtmd_prefill(const std::string &prompt, const std::vector<::mtmd_bitmap * > &bitmaps, std::string &err_msg)
Run mtmd_tokenize + mtmd_helper_eval_chunks on a prompt.
bool run_prefill_cached(const std::vector< llama_token > &tokens, const std::string &system_prompt, const std::vector< Message > &messages, const GenerationParams &params)
Run prefill with prompt cache integration.
GenerationResult do_generate_text_only(const std::vector< Message > &messages, const GenerationParams &params)
Text-only batch generation (extracted from do_generate).
std::string apply_chat_template(const std::vector< Message > &messages, const GenerationParams &params) const
Apply chat template to messages.
GenerationResult do_generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
Streaming generation with per-token callback.
bool has_vision_
Cached mtmd_support_vision(mtmd_ctx_) result.
bool decode_tokens_from(const std::vector< llama_token > &tokens, int start_offset)
Decode tokens starting at a given offset.
void release_temp_seq_id(llama_seq_id seq_id)
Release a temporary sequence ID back to the pool.
int do_count_tokens(const std::string &text) const override
Count tokens in text.
::mtmd_context * mtmd_ctx_
libmtmd context, or nullptr if no mmproj loaded.
GenerationResult generate_multimodal(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel)
Multimodal generation core (v1.9.11 Phases 5–7).
std::mutex seq_id_mutex_
Guards temp seq_id pool (v1.9.10)
bool do_clear_state(int seq_id) override
Clear KV cache or recurrent hidden state.
static float extract_token_logprob(const float *logits, int32_t next_token, int n_vocab)
Extract log-probability for a token from logits.
void do_deactivate() override
Deactivate: free context, reload model CPU-only.
llama_sampler * create_sampler(const GenerationParams &params) const
Create sampler chain from generation params.
BackendInfo do_info() const override
Populate backend metadata from llama.cpp model.
bool do_activate() override
Activate model on GPU (WARM → ACTIVE).
bool prefill_and_cache_prefix(const std::vector< llama_token > &tokens, int prefix_tokens, const CacheKey &key)
Two-pass prefill: prefix-only prefill → save → rest.
llama_seq_id allocate_temp_seq_id()
Allocate a temporary sequence ID for evaluation.
PromptCacheConfig prompt_cache_config_
Cache config (v1.8.3)
void do_unload() override
Full unload — free all resources, clear prompt cache.
llama_model * model_
Loaded model (WARM+)
~LlamaCppBackend() override
Free llama.cpp + mtmd resources on destruction.
std::vector< llama_seq_id > free_seq_ids_
Available temporary seq_ids (v1.9.10)
GenerationResult do_generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
Speculative streaming via the abstract InferenceBackend interface (kept as NOT_SUPPORTED — see kernel...
GenerationResult do_generate_streaming_text_only(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Text-only streaming generation (extracted from streaming).
static std::string extract_system_prompt(const std::vector< Message > &messages)
Extract the system prompt from messages.
static CacheKey make_key(std::string_view prompt_text, std::string_view model_path)
Compute a cache key from prompt text and model path.
entropic_error_t
Error codes returned by all C API functions.
@ ENTROPIC_ERROR_CANCELLED
Operation cancelled via cancel token.
@ ENTROPIC_ERROR_IMAGE_LOAD_FAILED
Image file could not be read or decoded (v1.9.11)
@ ENTROPIC_ERROR_NOT_SUPPORTED
Capability not supported by this backend (v1.9.13)
@ ENTROPIC_ERROR_GENERATE_FAILED
Generation failed (context overflow, model error)
@ ENTROPIC_ERROR_INVALID_STATE
Operation not valid in current state (e.g., generate before activate)
LlamaCppBackend — llama.cpp C API integration.
spdlog initialization and logger access.
auto now()
Get current time for timing measurements.
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
double elapsed_ms(std::chrono::steady_clock::time_point start, std::chrono::steady_clock::time_point end)
Compute elapsed milliseconds between two time points.
Activate model on GPU (WARM → ACTIVE).
@ IMAGE
Image content (local path or data URI)
BackendCapability
Capabilities that an inference backend may or may not support.
@ SPECULATIVE_DECODING
Speculative decoding compatibility.
@ HIDDEN_STATE
Recurrent hidden state management (save/load/reset)
@ VISION
Vision / multimodal input (v1.9.11)
@ KV_CACHE
KV cache state management (save/load/clear)
@ AUDIO
Audio input via mtmd audio projector (gh#53, v2.3.0)
@ _COUNT
Sentinel — must be last. Used for iteration/array sizing.
bool has_images(const std::vector< ContentPart > &parts)
Check if content parts contain any image parts.
static bool spec_decode_both(SpeculativeRunState &state)
Decode the speculative batch on both contexts.
std::string extract_text(const std::vector< ContentPart > &parts)
Extract concatenated text from content parts.
static void spec_ckpt_save_tgt(SpeculativeRunState &state)
Snapshot target state right before the target decode of the speculative batch (when use_ckpt_tgt + no...
static void spec_trim_rejected_drafts(SpeculativeRunState &state)
Clear any stale KV positions left by rejected draft tokens.
static bool spec_commit_accepted(SpeculativeRunState &state, const std::vector< llama_token > &ids, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
Walk accepted ids, emit tokens via callback, update state.
static std::string spec_emit_token(SpeculativeRunState &state, llama_token id, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
Emit on_token for one accepted id, updating state and returning a stop signal when terminating condit...
static void spec_rollback_partial(SpeculativeRunState &state, common_sampler *smpl_save, std::vector< llama_token > &ids)
Partial-acceptance rollback: restore both contexts and the sampler to their pre-draft state,...
static std::string spec_check_preconditions(bool target_active, bool draft_active, llama_context *ctx_tgt, llama_context *ctx_dft)
Validate speculative preconditions and reject NO-seq_rm.
@ COLD
On disk only, no RAM consumed.
static std::string concat_messages_fallback(const std::vector< Message > &messages)
Plain "role: content" join used when templating fails.
static int spec_run_draft(SpeculativeRunState &state)
Trigger draft generation via common_speculative_draft.
static void spec_run_loop(SpeculativeRunState &state, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
Run the accept-round loop until completion / EOS / cancel.
static std::vector< llama_chat_message > to_llama_chat(const std::vector< Message > &messages)
Apply GGUF-embedded chat template to messages.
static GenerationResult spec_finalize(SpeculativeRunState &state, std::chrono::steady_clock::time_point t0)
Speculative kernel against an explicit draft backend.
static bool spec_accept_round(SpeculativeRunState &state, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
Run one speculative accept round; return false to stop.
static GenerationResult spec_run_from_tokens(llama_context *ctx_tgt, llama_context *ctx_dft, llama_model *model_tgt, const std::vector< llama_token > &tokens, const GenerationParams &params, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel, int n_draft_max, const std::string &draft_path, std::chrono::steady_clock::time_point t0)
Public entry point for the speculative-decoding kernel.
static void spec_build_batch(SpeculativeRunState &state)
Build the target batch [id_last, draft0, ..., draftN-1].
static int spec_prepare_draft(SpeculativeRunState &state)
Drive one accept round: optional draft generation, decode on both contexts, sample-and-accept,...
static void spec_ckpt_save_dft(SpeculativeRunState &state)
Drive one accept round: draft → decode → sample-and-accept → emit tokens.
static void spec_cleanup(SpeculativeRunState &state)
Free everything allocated by the kernel.
static std::string spec_init_sampler_and_decoder(SpeculativeRunState &state, llama_model *model_tgt, const GenerationParams &params, int n_draft_max, const std::string &draft_path)
Initialize the kernel state: clear KV, prefill, sampler, speculative context, batch,...
static void spec_ckpt_restore_dft(SpeculativeRunState &state)
Restore the draft's pre-draft state so the upcoming target-batch decode on the draft re-fills cleanly...
static std::string spec_init_run(SpeculativeRunState &state, llama_model *model_tgt, const std::vector< llama_token > &tokens, const GenerationParams &params, int n_draft_max, const std::string &draft_path)
Initialize speculative run state (prefill + sampler + decoder).
Backend metadata for introspection.
size_t ram_bytes
RAM consumed by loaded model (bytes). 0 if COLD.
int max_context_length
Maximum context length.
size_t parameter_count
Number of parameters (from model metadata).
std::string architecture
Architecture family of the loaded model.
std::string compute_device
"cuda", "vulkan", "cpu", "npu"
std::string name
Backend identifier (e.g. "llama.cpp", "axcl")
std::string quantization
Quantization type (e.g. "IQ3_XXS", "Q8_0", "fp16").
size_t vram_bytes
VRAM consumed by loaded model (bytes). 0 if COLD.
std::string model_format
"gguf", "axmodel", "onnx", etc.
Single cached KV state snapshot.
std::vector< uint8_t > data
Raw KV cache bytes.
size_t data_size
data.size() for quick byte accounting
int token_count
Prompt tokens covered by this entry.
64-bit hash used as cache lookup key.
Generation parameters for a single inference call.
std::string grammar
GBNF grammar string (empty = unconstrained)
float repeat_penalty
Repetition penalty.
float temperature
Sampling temperature.
int max_tokens
Maximum tokens to generate.
float top_p
Nucleus sampling threshold.
int seed
RNG seed for reproducible sampling.
std::vector< std::string > stop
Stop sequences.
Result of a single generation call.
entropic_error_t error_code
Error code (ENTROPIC_OK if no error)
double generation_time_ms
Wall-clock generation time.
std::string finish_reason
Finish reason: "stop", "length", "error".
std::string content
Generated text (cleaned by adapter)
std::string error_message
Error description (empty if no error)
int token_count
Generated token count.
Per-token log-probability evaluation result.
std::vector< float > logprobs
Log-prob for each token transition (N-1 values)
int n_logprobs
Number of logprob values (n_tokens - 1)
int n_tokens
Number of input tokens.
float total_logprob
Sum of all logprob values.
float perplexity
exp(-mean(logprobs)) — lower = less surprising
std::vector< int32_t > tokens
Input tokens echoed back for verification.
Model configuration for a single tier.
std::filesystem::path mmproj_path
Vision projector GGUF path.
int gpu_layers
GPU offload layers (-1 = all)
int context_length
Context window size (512–131072)
std::filesystem::path path
Resolved model file path.
int n_threads
CPU threads (0 = auto-detect)
std::string cache_type_k
KV cache key quantization type.
std::string cache_type_v
KV cache value quantization type.
int n_batch
Batch size for prompt processing.
bool flash_attn
Enable flash attention.
bool use_mlock
Lock model in system RAM.
size_t max_bytes
Maximum cache RAM (512 MB default)
bool log_hits
Log cache hit/miss at INFO level.
bool enabled
Master switch (false = no caching)
Bundles per-kernel-run mutable state to keep the loop body focused on its responsibility (knots: cogn...