Entropic 2.3.8
Local-first agentic inference engine
Loading...
Searching...
No Matches
llama_cpp_backend.cpp
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
16#include "llama_cpp_backend.h"
17
19
20#include <common.h>
21#include <sampling.h>
22#include <speculative.h>
23#include <mtmd.h>
24#include <mtmd-helper.h>
25
26#include <cmath>
27#include <cstring>
28#include <stdexcept>
29
30namespace entropic {
31
32namespace {
33
34auto logger = entropic::log::get("inference.llama_cpp");
35
/**
 * @brief Report whether `text` terminates with `suffix`.
 *
 * @param text   String whose tail is inspected.
 * @param suffix Candidate trailing substring (an empty suffix always matches).
 * @return true when `suffix` is no longer than `text` and equals its tail.
 */
bool ends_with(const std::string& text, const std::string& suffix) {
    // A suffix longer than the text can never match; bail before computing
    // the (otherwise underflowing) start offset.
    if (suffix.size() > text.size()) {
        return false;
    }
    const std::size_t tail_start = text.size() - suffix.size();
    return text.compare(tail_start, suffix.size(), suffix) == 0;
}
48
57bool check_stop_sequences(
58 const std::string& text,
59 const std::vector<std::string>& stop_sequences)
60{
61 for (const auto& stop : stop_sequences) {
62 if (!stop.empty() && ends_with(text, stop)) {
63 return true;
64 }
65 }
66 return false;
67}
68
75GenerationResult prefill_error() {
76 GenerationResult r;
77 r.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
78 r.error_message = "Prefill decode failed";
79 r.finish_reason = "error";
80 return r;
81}
82
89void log_sampler_config(const GenerationParams& params) {
90 logger->info("Sampler: temp={:.2f}, top_k={}, top_p={:.2f}, "
91 "repeat_penalty={:.2f}, thinking={}",
92 params.temperature, params.top_k, params.top_p,
93 params.repeat_penalty, params.enable_thinking);
94}
95
103void finalize_result(GenerationResult& result,
104 std::chrono::steady_clock::time_point start_time)
105{
106 auto end = entropic::log::now();
107 result.generation_time_ms = entropic::log::elapsed_ms(
108 start_time, end);
109 if (result.token_count > 0 && result.generation_time_ms > 0.0) {
110 result.throughput_tok_s =
111 static_cast<double>(result.token_count)
112 / result.generation_time_ms * 1000.0;
113 }
114 logger->info("Generated: {} tokens, finish={}, {:.0f}ms, "
115 "{:.1f} tok/s",
116 result.token_count, result.finish_reason,
117 result.generation_time_ms, result.throughput_tok_s);
118 logger->info("Content:\n{}", result.content);
119}
120
130ggml_type parse_kv_cache_type(const std::string& s) {
131 static const std::pair<const char*, ggml_type> kTable[] = {
132 {"f16", GGML_TYPE_F16},
133 {"f32", GGML_TYPE_F32},
134 {"bf16", GGML_TYPE_BF16},
135 {"q8_0", GGML_TYPE_Q8_0},
136 {"q4_0", GGML_TYPE_Q4_0},
137 };
138 for (const auto& [name, type] : kTable) {
139 if (s == name) { return type; }
140 }
141 logger->warn("Unknown cache_type '{}' — defaulting to f16", s);
142 return GGML_TYPE_F16;
143}
144
145} // anonymous namespace
146
147// ── Lifecycle ──────────────────────────────────────────────
148
161 llama_model_params mparams = llama_model_default_params();
162 mparams.n_gpu_layers = 0;
163 mparams.use_mmap = true;
164 mparams.use_mlock = config.use_mlock;
165
166 model_ = llama_model_load_from_file(config.path.c_str(), mparams);
167 if (!model_) {
168 last_error_ = "llama_model_load_from_file failed: " + config.path.string();
169 return false;
170 }
171
172 vocab_ = llama_model_get_vocab(model_);
173 is_recurrent_ = llama_model_is_recurrent(model_);
174 logger->info("Model loaded (CPU): {} tokens in vocab, recurrent={}",
175 llama_vocab_n_tokens(vocab_), is_recurrent_);
176 return true;
177}
178
189namespace {
202llama_context_params build_cparams(const entropic::ModelConfig& cfg) {
203 llama_context_params c = llama_context_default_params();
204 c.n_ctx = static_cast<uint32_t>(cfg.context_length);
205 c.n_batch = static_cast<uint32_t>(cfg.n_batch);
206 c.n_threads = cfg.n_threads > 0
207 ? static_cast<uint32_t>(cfg.n_threads)
208 : std::thread::hardware_concurrency();
209 c.flash_attn_type = cfg.flash_attn
210 ? LLAMA_FLASH_ATTN_TYPE_ENABLED
211 : LLAMA_FLASH_ATTN_TYPE_DISABLED;
212 c.type_k = parse_kv_cache_type(cfg.cache_type_k);
213 c.type_v = parse_kv_cache_type(cfg.cache_type_v);
214 return c;
215}
216} // anonymous namespace
217
231 if (!load_gpu_model()) { return false; }
232 if (!create_inference_context()) { return false; }
234 return true;
235}
236
244 llama_model_params mparams = llama_model_default_params();
245 mparams.n_gpu_layers = config().gpu_layers;
246 mparams.use_mmap = true;
247 mparams.use_mlock = config().use_mlock;
248
249 if (!config().tensor_split.empty()) {
250 // TODO: parse tensor_split string into float array for multi-GPU
251 logger->warn("tensor_split not yet implemented, ignoring");
252 }
253
254 llama_model* new_model = llama_model_load_from_file(
255 config().path.c_str(), mparams);
256 if (!new_model) {
257 // llama.cpp returns null with no error string — the actual
258 // reason (OOM, CUDA init failure, GGUF parse error, etc.)
259 // only surfaces in ggml's log stream. Point the operator at
260 // it so multi-handle GPU failures (gh#58 v2.2.7 follow-up)
261 // are diagnosable without source-diving llama.cpp.
262 last_error_ = "Failed to reload model with GPU layers "
263 "(path=" + config().path.string()
264 + ", gpu_layers=" + std::to_string(config().gpu_layers)
265 + ") — check llama_ggml.log in the engine's log_dir "
266 "for the underlying llama.cpp/CUDA error";
267 return false;
268 }
269
270 if (model_) { llama_model_free(model_); }
271 model_ = new_model;
272 vocab_ = llama_model_get_vocab(model_);
273 return true;
274}
275
283 llama_context_params cparams = build_cparams(config());
284
285 ctx_ = llama_init_from_model(model_, cparams);
286 if (!ctx_) {
287 last_error_ = "llama_init_from_model failed";
288 return false;
289 }
290
291 logger->info("Context created: n_ctx={}, n_batch={}, "
292 "flash_attn={}, type_k={}, type_v={}",
293 config().context_length, config().n_batch,
294 config().flash_attn,
295 config().cache_type_k, config().cache_type_v);
296
297 // Initialize prompt cache if not already created
298 if (!prompt_cache_) {
299 prompt_cache_ = std::make_unique<PromptCache>(
301 logger->info("Prompt cache initialized: max_bytes={}",
303 }
304 return true;
305}
306
320 if (config().mmproj_path.empty()) {
321 has_vision_ = false;
322 return;
323 }
324 auto ctx_params = mtmd_context_params_default();
325 ctx_params.use_gpu = (config().gpu_layers != 0);
326 ctx_params.flash_attn_type = config().flash_attn
327 ? LLAMA_FLASH_ATTN_TYPE_ENABLED
328 : LLAMA_FLASH_ATTN_TYPE_DISABLED;
329 ctx_params.print_timings = false;
330 mtmd_ctx_ = mtmd_init_from_file(
331 config().mmproj_path.c_str(), model_, ctx_params);
332 if (mtmd_ctx_ == nullptr) {
333 logger->error("mtmd_init_from_file failed for {} — "
334 "continuing in text-only mode",
335 config().mmproj_path.string());
336 has_vision_ = false;
337 return;
338 }
339 has_vision_ = mtmd_support_vision(mtmd_ctx_);
340 logger->info("mmproj loaded from {} — vision={}",
341 config().mmproj_path.string(), has_vision_);
342}
343
350 // v2.1.8: mtmd holds a reference to the live llama_model — free
351 // it before the GPU model is unloaded below.
352 if (mtmd_ctx_ != nullptr) {
353 mtmd_free(mtmd_ctx_);
354 mtmd_ctx_ = nullptr;
355 has_vision_ = false;
356 }
357 if (ctx_) {
358 llama_free(ctx_);
359 ctx_ = nullptr;
360 }
361
362 // Reload model CPU-only for WARM state
363 llama_model_params mparams = llama_model_default_params();
364 mparams.n_gpu_layers = 0;
365 mparams.use_mmap = true;
366 mparams.use_mlock = config().use_mlock;
367
368 llama_model* cpu_model = llama_model_load_from_file(
369 config().path.c_str(), mparams);
370 if (cpu_model) {
371 llama_model_free(model_);
372 model_ = cpu_model;
373 vocab_ = llama_model_get_vocab(model_);
374 } else {
375 logger->warn("Failed to reload CPU model during deactivate, keeping GPU model");
376 }
377}
378
393
400 if (prompt_cache_) {
401 prompt_cache_->clear();
402 }
403 // v2.1.8: mtmd must be freed before the underlying llama_model.
404 if (mtmd_ctx_ != nullptr) {
405 mtmd_free(mtmd_ctx_);
406 mtmd_ctx_ = nullptr;
407 has_vision_ = false;
408 }
409 if (ctx_) {
410 llama_free(ctx_);
411 ctx_ = nullptr;
412 }
413 if (model_) {
414 llama_model_free(model_);
415 model_ = nullptr;
416 }
417 vocab_ = nullptr;
418}
419
420// ── Tokenization ───────────────────────────────────────────
421
430std::vector<llama_token> LlamaCppBackend::tokenize(
431 const std::string& text, bool add_special) const
432{
433 // First call: get required size
434 int n = llama_tokenize(vocab_, text.c_str(),
435 static_cast<int32_t>(text.size()),
436 nullptr, 0, add_special, true);
437 if (n < 0) {
438 n = -n;
439 }
440
441 std::vector<llama_token> tokens(static_cast<size_t>(n));
442 int actual = llama_tokenize(vocab_, text.c_str(),
443 static_cast<int32_t>(text.size()),
444 tokens.data(), n, add_special, true);
445 if (actual < 0) {
446 logger->error("Tokenization failed for text of length {}", text.size());
447 return {};
448 }
449 tokens.resize(static_cast<size_t>(actual));
450 return tokens;
451}
452
460std::string LlamaCppBackend::detokenize(llama_token token) const {
461 // special=false — special tokens don't render to surface text.
462 //
463 // History: v2.3.4 (gh#68) flipped this from `true` to `false`
464 // expecting it would fix Gemma 4's `<|im_end|>` content leak.
465 // It did NOT — Gemma 4 emits `<|im_end|>` as multi-token *regular*
466 // surface tokens (the GGUF tokenizer decomposes it into `<`, `|`,
467 // `im`, `_end`, `|>` or similar), not as a single special token.
468 // None of those individual tokens are classified as special, so
469 // this flag has no effect on them.
470 //
471 // The actual gh#68 fix lives in `Gemma4Adapter::parse_tool_calls`
472 // (v2.3.5) which scrubs chat-template markers from cleaned_content
473 // at the adapter layer — same surface gh#65 used for the
474 // `<|tool_call>` asymmetric-tag scrub.
475 //
476 // Keeping `special=false` as a defensive measure regardless: any
477 // future model that DOES emit a chat-template marker as a single
478 // special token would be filtered. Zero cost for the current
479 // model fleet. Stop semantics are independent of this flag —
480 // the streaming loop short-circuits on `llama_vocab_is_eog()`
481 // BEFORE calling detokenize.
482 char buf[256];
483 int n = llama_token_to_piece(vocab_, token, buf, sizeof(buf), 0, false);
484 if (n < 0) {
485 // Buffer too small — retry with exact size
486 std::vector<char> large(static_cast<size_t>(-n));
487 n = llama_token_to_piece(vocab_, token, large.data(),
488 static_cast<int32_t>(large.size()), 0, false);
489 if (n > 0) {
490 return std::string(large.data(), static_cast<size_t>(n));
491 }
492 return "";
493 }
494 return std::string(buf, static_cast<size_t>(n));
495}
496
504int LlamaCppBackend::do_count_tokens(const std::string& text) const {
505 auto tokens = tokenize(text, false);
506 return static_cast<int>(tokens.size());
507}
508
517 const std::string& text) const {
518 auto tokens = tokenize(text, true);
519 return {tokens.begin(), tokens.end()};
520}
521
522// ── Evaluation (v1.9.10) ──────────────────────────────────
523
539 const int32_t* tokens,
540 int n_tokens)
541{
542 int n_vocab = llama_vocab_n_tokens(vocab_);
543 LogprobResult result;
544 result.tokens.assign(tokens, tokens + n_tokens);
545 result.n_tokens = n_tokens;
546 result.n_logprobs = n_tokens - 1;
547 result.logprobs.reserve(result.n_logprobs);
548
549 auto* mem = llama_get_memory(ctx_);
550 llama_memory_clear(mem, true);
551
552 for (int i = 0; i < n_tokens; i++) {
553 llama_token tok = tokens[i];
554 llama_batch batch = llama_batch_get_one(&tok, 1);
555 int rc = llama_decode(ctx_, batch);
556 if (rc != 0) {
557 llama_memory_clear(mem, true);
558 throw std::runtime_error("llama_decode failed at logprob pos");
559 }
560 if (i < n_tokens - 1) {
561 const float* logits = llama_get_logits_ith(ctx_, -1);
562 float lp = extract_token_logprob(
563 logits, tokens[i + 1], n_vocab);
564 result.logprobs.push_back(lp);
565 }
566 }
567
568 float sum = 0.0f;
569 for (float lp : result.logprobs) { sum += lp; }
570 result.total_logprob = sum;
571 result.perplexity = std::exp(
572 -sum / static_cast<float>(result.n_logprobs));
573
574 llama_memory_clear(mem, true);
575 return result;
576}
577
585 std::lock_guard<std::mutex> lock(seq_id_mutex_);
586 if (!free_seq_ids_.empty()) {
587 auto id = free_seq_ids_.back();
588 free_seq_ids_.pop_back();
589 return id;
590 }
591 return static_cast<llama_seq_id>(1 + free_seq_ids_.size());
592}
593
600void LlamaCppBackend::release_temp_seq_id(llama_seq_id seq_id) {
601 std::lock_guard<std::mutex> lock(seq_id_mutex_);
602 free_seq_ids_.push_back(seq_id);
603}
604
619 const float* logits,
620 int32_t next_token,
621 int n_vocab)
622{
623 float max_logit = logits[0];
624 for (int v = 1; v < n_vocab; v++) {
625 if (logits[v] > max_logit) {
626 max_logit = logits[v];
627 }
628 }
629 float sum_exp = 0.0f;
630 for (int v = 0; v < n_vocab; v++) {
631 sum_exp += std::exp(logits[v] - max_logit);
632 }
633 float log_sum_exp = max_logit + std::log(sum_exp);
634 return logits[next_token] - log_sum_exp;
635}
636
637// ── Chat template ──────────────────────────────────────────
638
659static std::vector<llama_chat_message> to_llama_chat(
660 const std::vector<Message>& messages) {
661 std::vector<llama_chat_message> chat_msgs;
662 chat_msgs.reserve(messages.size());
663 for (const auto& msg : messages) {
664 chat_msgs.push_back({msg.role.c_str(), msg.content.c_str()});
665 }
666 return chat_msgs;
667}
668
676static std::string concat_messages_fallback(
677 const std::vector<Message>& messages) {
678 std::string fallback;
679 for (const auto& msg : messages) {
680 fallback += msg.role + ": " + msg.content + "\n";
681 }
682 return fallback;
683}
684
691 const std::vector<Message>& messages,
692 const GenerationParams& params) const
693{
694 auto chat_msgs = to_llama_chat(messages);
695
696 // First call: measure required buffer size
697 int n = llama_chat_apply_template(
698 nullptr, chat_msgs.data(), chat_msgs.size(),
699 true, nullptr, 0);
700 if (n < 0) {
701 logger->error("llama_chat_apply_template failed (size query)");
702 return concat_messages_fallback(messages);
703 }
704
705 std::vector<char> buf(static_cast<size_t>(n + 1));
706 int written = llama_chat_apply_template(
707 nullptr, chat_msgs.data(), chat_msgs.size(),
708 true, buf.data(), static_cast<int32_t>(buf.size()));
709 if (written < 0) {
710 logger->error("llama_chat_apply_template failed (render)");
711 return "";
712 }
713
714 return std::string(buf.data(), static_cast<size_t>(written));
715}
716
717// ── Sampler ────────────────────────────────────────────────
718
733 const GenerationParams& params) const
734{
735 llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
736 llama_sampler* chain = llama_sampler_chain_init(chain_params);
737
738 // Grammar constraint (applied first)
739 if (!params.grammar.empty()) {
740 llama_sampler* grammar = llama_sampler_init_grammar(
741 vocab_, params.grammar.c_str(), "root");
742 if (grammar) {
743 llama_sampler_chain_add(chain, grammar);
744 }
745 }
746
747 // Repetition penalty
748 if (params.repeat_penalty != 1.0f) {
749 llama_sampler_chain_add(chain,
750 llama_sampler_init_penalties(
751 64, params.repeat_penalty, 0.0f, 0.0f));
752 }
753
754 // Temperature
755 if (params.temperature > 0.0f) {
756 llama_sampler_chain_add(chain,
757 llama_sampler_init_temp(params.temperature));
758 }
759
760 // Top-K
761 if (params.top_k > 0) {
762 llama_sampler_chain_add(chain,
763 llama_sampler_init_top_k(params.top_k));
764 }
765
766 // Top-P (nucleus sampling)
767 if (params.top_p < 1.0f) {
768 llama_sampler_chain_add(chain,
769 llama_sampler_init_top_p(params.top_p, 1));
770 }
771
772 // Final distribution sampler — use caller seed or LLAMA_DEFAULT_SEED
773 uint32_t seed = params.seed < 0
774 ? LLAMA_DEFAULT_SEED
775 : static_cast<uint32_t>(params.seed);
776 llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));
777
778 return chain;
779}
780
781// ── Decode loop ────────────────────────────────────────────
782
790bool LlamaCppBackend::run_prefill(const std::vector<llama_token>& tokens) {
791 llama_memory_clear(llama_get_memory(ctx_), true);
792
793 const int n_batch = config().n_batch;
794 const int n_tokens = static_cast<int>(tokens.size());
795
796 for (int i = 0; i < n_tokens; i += n_batch) {
797 int chunk = std::min(n_batch, n_tokens - i);
798 std::vector<llama_token> slice(
799 tokens.begin() + i, tokens.begin() + i + chunk);
800 llama_batch batch = llama_batch_get_one(
801 slice.data(), static_cast<int32_t>(chunk));
802 if (llama_decode(ctx_, batch) != 0) {
803 logger->error("Prefill decode failed at offset {}", i);
804 return false;
805 }
806 }
807 return true;
808}
809
821 llama_sampler* sampler,
822 std::string& generated,
823 std::function<void(std::string_view)>& on_token,
824 const std::vector<std::string>& stop)
825{
826 llama_token new_token = llama_sampler_sample(sampler, ctx_, -1);
827
828 if (new_token == llama_vocab_eos(vocab_)
829 || llama_vocab_is_eog(vocab_, new_token)) {
830 return "eos";
831 }
832
833 std::string piece = detokenize(new_token);
834 generated += piece;
835 if (on_token) {
836 on_token(std::string_view(piece));
837 }
838 if (check_stop_sequences(generated, stop)) {
839 return "stop";
840 }
841
842 llama_token tok = new_token;
843 llama_batch single = llama_batch_get_one(&tok, 1);
844 return (llama_decode(ctx_, single) == 0) ? "continue" : "error";
845}
846
858 const std::vector<llama_token>& tokens,
859 const GenerationParams& params,
860 std::function<void(std::string_view)> on_token,
861 std::atomic<bool>* cancel)
862{
863 GenerationResult result;
864 llama_sampler* sampler = create_sampler(params);
865
866 if (!run_prefill(tokens)) {
868 result.error_message = "Prefill decode failed";
869 result.finish_reason = "error";
870 llama_sampler_free(sampler);
871 return result;
872 }
873
874 std::string generated;
875 int n_generated = 0;
876
877 while (n_generated < params.max_tokens) {
878 bool cancelled = cancel && cancel->load(std::memory_order_acquire);
879 if (cancelled) {
880 result.finish_reason = "cancelled";
882 break;
883 }
884
885 auto status = step_token(sampler, generated, on_token, params.stop);
886 if (status == "continue") {
887 ++n_generated;
888 } else {
889 result.finish_reason = (status == "error") ? "error" : "stop";
890 if (status == "error") {
892 }
893 break;
894 }
895 }
896
897 if (n_generated >= params.max_tokens && result.finish_reason.empty()) {
898 result.finish_reason = "length";
899 }
900
901 llama_sampler_free(sampler);
902 result.content = generated;
903 result.token_count = n_generated;
904 return result;
905}
906
907// ── Prompt cache helpers ───────────────────────────────────
908
917 const std::vector<Message>& messages)
918{
919 for (const auto& msg : messages) {
920 if (msg.role == "system") {
921 return msg.content;
922 }
923 }
924 return "";
925}
926
940 const std::vector<llama_token>& tokens, int start_offset)
941{
942 int total = static_cast<int>(tokens.size());
943 if (start_offset >= total) { return true; }
944
945 int n_batch = llama_n_batch(ctx_);
946 int n_remaining = total - start_offset;
947 for (int off = 0; off < n_remaining; off += n_batch) {
948 int chunk = std::min(n_batch, n_remaining - off);
949 llama_batch batch = llama_batch_get_one(
950 const_cast<llama_token*>(tokens.data())
951 + start_offset + off,
952 chunk);
953 if (llama_decode(ctx_, batch) != 0) {
954 logger->error("Decode chunk failed (start={}, off={}, "
955 "chunk={})", start_offset, off, chunk);
956 return false;
957 }
958 }
959 return true;
960}
961
979 const CacheEntry* cached,
980 const std::vector<llama_token>& tokens)
981{
982 auto* mem = llama_get_memory(ctx_);
983 llama_memory_clear(mem, true);
984
985 size_t restored = llama_state_seq_set_data(
986 ctx_, cached->data.data(), cached->data_size, 0);
987 if (restored == 0) {
988 logger->warn("KV state restore failed, falling back to full prefill");
989 return false;
990 }
991
992 return decode_tokens_from(tokens, cached->token_count);
993}
994
1008 const CacheKey& key, int prefix_tokens)
1009{
1010 size_t state_size = llama_state_seq_get_size(ctx_, 0);
1011 if (state_size == 0) {
1012 return;
1013 }
1014
1015 std::vector<uint8_t> buf(state_size);
1016 size_t written = llama_state_seq_get_data(
1017 ctx_, buf.data(), buf.size(), 0);
1018 if (written > 0) {
1019 buf.resize(written);
1020 prompt_cache_->store(key, std::move(buf), prefix_tokens);
1021 }
1022}
1023
1033 const std::vector<Message>& messages,
1034 const GenerationParams& params)
1035{
1036 std::vector<Message> sys_msgs;
1037 for (const auto& msg : messages) {
1038 if (msg.role == "system") {
1039 sys_msgs.push_back(msg);
1040 }
1041 }
1042 if (sys_msgs.empty()) {
1043 return 0;
1044 }
1045
1046 std::string sys_prompt = apply_chat_template(sys_msgs, params);
1047 auto sys_tokens = tokenize(sys_prompt, true);
1048 return static_cast<int>(sys_tokens.size());
1049}
1050
1073 const std::vector<llama_token>& tokens,
1074 int prefix_tokens,
1075 const CacheKey& key)
1076{
1077 int total = static_cast<int>(tokens.size());
1078 if (prefix_tokens <= 0 || prefix_tokens >= total) {
1079 return run_prefill(tokens);
1080 }
1081
1082 // Pass 1: prefill only the prefix — `run_prefill` calls
1083 // llama_memory_clear, so seq 0 ends up holding exactly
1084 // prefix_tokens positions.
1085 std::vector<llama_token> prefix(
1086 tokens.begin(), tokens.begin() + prefix_tokens);
1087 if (!run_prefill(prefix)) {
1088 return false;
1089 }
1090
1091 // Save now: state contains exactly the prefix.
1092 save_prefix_to_cache(key, prefix_tokens);
1093
1094 // Pass 2: continue prefilling the remainder. No clear — decode
1095 // appends after the saved prefix positions.
1096 return decode_tokens_from(tokens, prefix_tokens);
1097}
1098
1115 const std::vector<llama_token>& tokens,
1116 const std::string& system_prompt,
1117 const std::vector<Message>& messages,
1118 const GenerationParams& params)
1119{
1120 bool cache_enabled = prompt_cache_
1122 && !system_prompt.empty();
1123
1124 if (!cache_enabled) {
1125 return run_prefill(tokens);
1126 }
1127
1129 system_prompt, config().path.string());
1130 const CacheEntry* cached = prompt_cache_->lookup(key);
1131
1132 if (cached != nullptr) {
1134 logger->info("Prompt cache HIT: {} bytes, {} prefix tokens",
1135 cached->data_size, cached->token_count);
1136 }
1137 if (restore_cached_prefix(cached, tokens)) {
1138 return true;
1139 }
1140 logger->warn("Cache restore failed, falling back to full prefill");
1141 } else if (prompt_cache_config_.log_hits) {
1142 logger->info("Prompt cache MISS: processing full prompt");
1143 }
1144
1145 int prefix_tokens = compute_prefix_token_count(messages, params);
1146 return prefill_and_cache_prefix(tokens, prefix_tokens, key);
1147}
1148
1149// ── Multimodal generation (v1.9.11 Phases 5–7 + v2.1.8) ────
1150
1151namespace {
1152
1158bool any_image_in(const std::vector<Message>& messages) {
1159 for (const auto& m : messages) {
1160 if (has_images(m.content_parts)) { return true; }
1161 }
1162 return false;
1163}
1164
1177std::vector<Message> strip_image_parts(
1178 const std::vector<Message>& messages) {
1179 std::vector<Message> out = messages;
1180 for (auto& m : out) {
1181 if (m.content_parts.empty()) { continue; }
1182 m.content = extract_text(m.content_parts);
1183 m.content_parts.clear();
1184 }
1185 return out;
1186}
1187
1205std::vector<Message> substitute_image_markers(
1206 const std::vector<Message>& messages,
1207 ::mtmd_context* ctx,
1208 std::vector<::mtmd_bitmap*>& bitmaps_out) {
1209 std::vector<Message> out;
1210 out.reserve(messages.size());
1211 const std::string marker = mtmd_default_marker();
1212 for (const auto& m : messages) {
1213 Message copy;
1214 copy.role = m.role;
1215 if (m.content_parts.empty()) {
1216 copy.content = m.content;
1217 out.push_back(std::move(copy));
1218 continue;
1219 }
1220 std::string built;
1221 for (const auto& p : m.content_parts) {
1222 if (p.type != ContentPartType::IMAGE) {
1223 built += p.text;
1224 continue;
1225 }
1226 ::mtmd_bitmap* bm = nullptr;
1227 if (!p.image_path.empty()) {
1228 bm = mtmd_helper_bitmap_init_from_file(
1229 ctx, p.image_path.c_str());
1230 }
1231 if (bm == nullptr) { return {}; }
1232 bitmaps_out.push_back(bm);
1233 built += marker;
1234 }
1235 copy.content = std::move(built);
1236 out.push_back(std::move(copy));
1237 }
1238 return out;
1239}
1240
1241} // anonymous namespace
1242
1254 const std::string& prompt,
1255 const std::vector<::mtmd_bitmap*>& bitmaps,
1256 std::string& err_msg)
1257{
1258 llama_memory_clear(llama_get_memory(ctx_), true);
1259 ::mtmd_input_text mt{prompt.c_str(), true, true};
1260 auto* chunks = mtmd_input_chunks_init();
1261 std::vector<const ::mtmd_bitmap*> bm_cptrs(
1262 bitmaps.begin(), bitmaps.end());
1263 int32_t tok_rc = mtmd_tokenize(
1264 mtmd_ctx_, chunks, &mt, bm_cptrs.data(), bm_cptrs.size());
1265 if (tok_rc != 0) {
1266 mtmd_input_chunks_free(chunks);
1267 err_msg = "mtmd_tokenize failed (rc="
1268 + std::to_string(tok_rc) + ")";
1270 }
1271 llama_pos new_n_past = 0;
1272 int32_t eval_rc = mtmd_helper_eval_chunks(
1273 mtmd_ctx_, ctx_, chunks, 0, 0,
1274 static_cast<int32_t>(config().n_batch),
1275 true, &new_n_past);
1276 mtmd_input_chunks_free(chunks);
1277 if (eval_rc != 0) {
1278 err_msg = "mtmd_helper_eval_chunks failed (rc="
1279 + std::to_string(eval_rc) + ")";
1281 }
1282 logger->info("Multimodal prefill complete: n_past={}", new_n_past);
1283 return ENTROPIC_OK;
1284}
1285
1297 const GenerationParams& params,
1298 std::function<void(std::string_view token)> on_token,
1299 std::atomic<bool>* cancel,
1300 const std::chrono::steady_clock::time_point& t0)
1301{
1302 GenerationResult result;
1303 auto* sampler = create_sampler(params);
1304 std::string generated;
1305 int n_generated = 0;
1306 while (n_generated < params.max_tokens) {
1307 if (cancel != nullptr
1308 && cancel->load(std::memory_order_acquire)) {
1309 result.finish_reason = "cancelled";
1311 break;
1312 }
1313 auto status = step_token(
1314 sampler, generated, on_token, params.stop);
1315 if (status == "continue") { ++n_generated; continue; }
1316 result.finish_reason = (status == "error") ? "error" : "stop";
1317 if (status == "error") {
1319 }
1320 break;
1321 }
1322 if (n_generated >= params.max_tokens
1323 && result.finish_reason.empty()) {
1324 result.finish_reason = "length";
1325 }
1326 llama_sampler_free(sampler);
1327 result.content = generated;
1328 result.token_count = n_generated;
1329 finalize_result(result, t0);
1330 return result;
1331}
1332
1339 const std::vector<Message>& messages,
1340 const GenerationParams& params,
1341 std::function<void(std::string_view token)> on_token,
1342 std::atomic<bool>* cancel)
1343{
1344 auto t0 = entropic::log::now();
1345 std::vector<::mtmd_bitmap*> bitmaps;
1346 auto marked = substitute_image_markers(
1347 messages, mtmd_ctx_, bitmaps);
1348 if (marked.empty()) {
1349 for (auto* b : bitmaps) { mtmd_bitmap_free(b); }
1350 GenerationResult err;
1352 err.error_message =
1353 "mtmd_helper_bitmap_init_from_file failed";
1354 return err;
1355 }
1356 auto prompt = apply_chat_template(marked, params);
1357 logger->info("Multimodal generate: {} images, prompt={} chars, max_tokens={}",
1358 bitmaps.size(), prompt.size(), params.max_tokens);
1359 std::string prefill_err;
1360 auto rc = mtmd_prefill(prompt, bitmaps, prefill_err);
1361 for (auto* b : bitmaps) { mtmd_bitmap_free(b); }
1362 if (rc != ENTROPIC_OK) {
1363 GenerationResult err;
1364 err.error_code = rc;
1365 err.error_message = std::move(prefill_err);
1366 return err;
1367 }
1368 return run_sampling_loop(params, on_token, cancel, t0);
1369}
1370
1371// ── Generation entry points ────────────────────────────────
1372
1389 const std::vector<Message>& messages,
1390 const GenerationParams& params)
1391{
1392 if (!any_image_in(messages)) {
1393 return do_generate_text_only(messages, params);
1394 }
1395 if (has_vision_ && mtmd_ctx_ != nullptr) {
1396 return generate_multimodal(messages, params, nullptr, nullptr);
1397 }
1398 logger->warn("Image content present but model has no vision "
1399 "capability — stripping image parts");
1400 return do_generate_text_only(strip_image_parts(messages), params);
1401}
1402
1409 const std::vector<Message>& messages,
1410 const GenerationParams& params)
1411{
1412 auto t0 = entropic::log::now();
1413 std::string prompt = apply_chat_template(messages, params);
1414 auto tokens = tokenize(prompt, true);
1415 std::string sys = extract_system_prompt(messages);
1416
1417 logger->info("Generate: {} input tokens, max_tokens={}",
1418 tokens.size(), params.max_tokens);
1419 log_sampler_config(params);
1420
1421 GenerationResult result;
1422 llama_sampler* sampler = create_sampler(params);
1423
1424 if (!run_prefill_cached(tokens, sys, messages, params)) {
1425 llama_sampler_free(sampler);
1426 return prefill_error();
1427 }
1428
1429 std::string generated;
1430 int n_generated = 0;
1431 std::function<void(std::string_view)> no_cb = nullptr;
1432
1433 while (n_generated < params.max_tokens) {
1434 auto status = step_token(
1435 sampler, generated, no_cb, params.stop);
1436 if (status == "continue") { ++n_generated; }
1437 else {
1438 result.finish_reason =
1439 (status == "error") ? "error" : "stop";
1440 if (status == "error") {
1442 }
1443 break;
1444 }
1445 }
1446
1447 if (n_generated >= params.max_tokens
1448 && result.finish_reason.empty()) {
1449 result.finish_reason = "length";
1450 }
1451
1452 llama_sampler_free(sampler);
1453 result.content = generated;
1454 result.token_count = n_generated;
1455 finalize_result(result, t0);
1456 return result;
1457}
1458
1470 const std::vector<Message>& messages,
1471 const GenerationParams& params,
1472 std::function<void(std::string_view token)> on_token,
1473 std::atomic<bool>& cancel)
1474{
1475 if (!any_image_in(messages)) {
1477 messages, params, on_token, cancel);
1478 }
1479 if (has_vision_ && mtmd_ctx_ != nullptr) {
1480 return generate_multimodal(messages, params, on_token, &cancel);
1481 }
1482 logger->warn("Image content present but model has no vision "
1483 "capability — stripping image parts");
1485 strip_image_parts(messages), params, on_token, cancel);
1486}
1487
1494 const std::vector<Message>& messages,
1495 const GenerationParams& params,
1496 std::function<void(std::string_view token)> on_token,
1497 std::atomic<bool>& cancel)
1498{
1499 auto t0 = entropic::log::now();
1500 auto prompt = apply_chat_template(messages, params);
1501 auto tokens = tokenize(prompt, true);
1502 auto sys = extract_system_prompt(messages);
1503 logger->info("Stream: {} input tokens, max_tokens={}",
1504 tokens.size(), params.max_tokens);
1505 log_sampler_config(params);
1506
1507 GenerationResult result;
1508 auto* sampler = create_sampler(params);
1509 if (!run_prefill_cached(tokens, sys, messages, params)) {
1510 llama_sampler_free(sampler);
1511 return prefill_error();
1512 }
1513 std::string generated;
1514 int n_generated = 0;
1515 while (n_generated < params.max_tokens) {
1516 if (cancel.load(std::memory_order_acquire)) {
1517 result.finish_reason = "cancelled";
1519 break;
1520 }
1521 auto status = step_token(
1522 sampler, generated, on_token, params.stop);
1523 if (status == "continue") { ++n_generated; }
1524 else {
1525 result.finish_reason =
1526 (status == "error") ? "error" : "stop";
1527 if (status == "error") {
1529 }
1530 break;
1531 }
1532 }
1533 if (n_generated >= params.max_tokens
1534 && result.finish_reason.empty()) {
1535 result.finish_reason = "length";
1536 }
1537 llama_sampler_free(sampler);
1538 result.content = generated;
1539 result.token_count = n_generated;
1540 finalize_result(result, t0);
1541 return result;
1542}
1543
// Speculative streaming via the abstract backend interface. All four
// parameters are deliberately unused: this entry point always returns an
// error result that points callers at generate_speculative_with_draft,
// which takes the draft backend explicitly.
// (The function's header line is not present in this listing.)
1557 const std::vector<Message>& /*messages*/,
1558 const GenerationParams& /*params*/,
1559 std::function<void(std::string_view)> /*on_token*/,
1560 std::atomic<bool>& /*cancel*/)
1561{
1562 GenerationResult result;
// NOTE(review): listing line 1563 is absent — presumably it sets
// result.error_code; confirm against the real source.
1564 result.error_message =
1565 "LlamaCppBackend speculative requires an explicit draft "
1566 "backend handle — orchestrator dispatches via "
1567 "generate_speculative_with_draft";
1568 result.finish_reason = "error";
1569 return result;
1570}
1571
1572namespace {
1573
1586common_params_sampling to_common_sampling(
1587 const GenerationParams& params) {
1588 common_params_sampling cps;
1589 cps.temp = params.temperature;
1590 cps.top_k = params.top_k;
1591 cps.top_p = params.top_p;
1592 cps.penalty_repeat = params.repeat_penalty;
1593 if (params.seed >= 0) {
1594 cps.seed = static_cast<uint32_t>(params.seed);
1595 }
1596 cps.no_perf = true;
1597 // Mirror entropic's standard sampler chain ordering so the
1598 // speculative path produces output bit-identical to plain decode
1599 // (the v2.1.11 correctness contract). Entropic's `create_sampler`
1600 // builds: penalties → top_k → top_p → temperature → dist, AND
1601 // SKIPS the temperature sampler when temp == 0 (greedy mode).
1602 // common_sampler appends an extended-temperature sampler that
1603 // differs subtly from "no temp at all" — we omit it for temp=0
1604 // to match entropic exactly. Also strip the additional filters
1605 // (min_p, top_n_sigma, dry, xtc, typical_p) in the default chain.
1606 cps.samplers = {COMMON_SAMPLER_TYPE_PENALTIES,
1607 COMMON_SAMPLER_TYPE_TOP_K,
1608 COMMON_SAMPLER_TYPE_TOP_P};
1609 if (params.temperature > 0.0f) {
1610 cps.samplers.push_back(COMMON_SAMPLER_TYPE_TEMPERATURE);
1611 }
1612 cps.min_p = 0.0f;
1613 cps.dry_multiplier = 0.0f;
1614 cps.top_n_sigma = -1.0f;
1615 return cps;
1616}
1617
1634bool spec_prefill_minus_last(
1635 llama_context* ctx, const std::vector<llama_token>& tokens) {
1636 int total = static_cast<int>(tokens.size()) - 1;
1637 if (total <= 0) { return true; }
1638 int n_batch = llama_n_batch(ctx);
1639 for (int off = 0; off < total; off += n_batch) {
1640 int chunk = std::min(n_batch, total - off);
1641 llama_batch batch = llama_batch_get_one(
1642 const_cast<llama_token*>(tokens.data()) + off, chunk);
1643 if (llama_decode(ctx, batch) != 0) { return false; }
1644 }
1645 return true;
1646}
1647
1653GenerationResult spec_error(entropic_error_t code, std::string msg) {
1654 GenerationResult r;
1655 r.error_code = code;
1656 r.error_message = std::move(msg);
1657 r.finish_reason = "error";
1658 return r;
1659}
1660
1661} // anonymous namespace
1662
// Per-run state of the speculative-decoding kernel (members of the
// SpeculativeRunState struct; its header line is not present in this
// listing).

// Handles owned by the run: spec and smpl are released by spec_cleanup.
1671 common_speculative* spec = nullptr;
1672 common_sampler* smpl = nullptr;
// Target and draft contexts; assigned by the caller and not freed by
// spec_cleanup.
1673 llama_context* ctx_tgt = nullptr;
1674 llama_context* ctx_dft = nullptr;
// Batch reused every round; batch_initialized records whether
// llama_batch_init ran so cleanup knows whether to free it.
1675 llama_batch batch_tgt{};
1676 bool batch_initialized = false;
1677 llama_seq_id seq_id = 0;
// n_past is the position at which id_last is placed in the next batch;
// id_last is the most recent token that has not been decoded yet.
1678 int n_past = 0;
1679 llama_token id_last = 0;
// prompt_tgt: prompt minus its last token, extended by one token per emit.
// draft: the draft tokens for the current round (may carry over after a
// partial-acceptance rollback).
1680 std::vector<llama_token> prompt_tgt;
1681 std::vector<llama_token> draft;
// Accumulated output text and counters used for the final result/log.
1682 std::string generated;
1683 int n_generated = 0;
1684 int n_drafted = 0;
1685 int n_accepted = 0;
1686 bool has_eos = false;
// Outcome fields copied into the GenerationResult by spec_finalize.
1687 std::string finish_reason;
1688 entropic_error_t error_code = ENTROPIC_OK;
1689 std::string error_message;
1690
1691 // ── Checkpoint state (v2.1.11) ──────────────────────────
1692 // Activated when either context reports FULL-only seq_rm
1693 // (no partial removal). The kernel saves+restores draft/target
1694 // state across each speculative round so the underlying
1695 // memory module never sees an attempted partial removal.
1696 // Mirrors the use_ckpt_tgt / use_ckpt_dft flow in upstream's
1697 // speculative-simple example.
1698 bool use_ckpt_tgt = false;
1699 bool use_ckpt_dft = false;
1700 common_prompt_checkpoint ckpt;
1701};
1702
// Free everything allocated by the kernel: the speculative decoder, the
// sampler, and (only if it was initialized) the target batch. The
// contexts themselves are not touched.
// NOTE(review): the pointers are not reset after being freed, so this
// looks safe to call only once per state — confirm callers honor that.
1710 if (state.spec) { common_speculative_free(state.spec); }
1711 if (state.smpl) { common_sampler_free(state.smpl); }
1712 if (state.batch_initialized) {
1713 llama_batch_free(state.batch_tgt);
1714 }
1715}
1716
// Build the target batch [id_last, draft0, ..., draftN-1]: id_last goes
// at position n_past and each draft token at the following consecutive
// position, all on state.seq_id.
// The trailing `true` is passed for every token — presumably the
// "request logits" flag of common_batch_add; confirm against common.h.
1724 common_batch_clear(state.batch_tgt);
1725 common_batch_add(state.batch_tgt, state.id_last,
1726 state.n_past, {state.seq_id}, true);
1727 int pos = state.n_past + 1;
1728 for (auto draft_token : state.draft) {
1729 common_batch_add(state.batch_tgt, draft_token, pos,
1730 {state.seq_id}, true);
1731 ++pos;
1732 }
1733}
1734
// Decode the speculative batch on both contexts. The batch is rebuilt
// from the current state, decoded on the target first, and then the very
// same batch is decoded on the draft context.
// Returns true on success. On the first failing decode it logs the return
// code, records ENTROPIC_ERROR_GENERATE_FAILED plus a message and the
// "error" finish reason in `state`, and returns false.
1743 spec_build_batch(state);
1744 int rc_tgt = llama_decode(state.ctx_tgt, state.batch_tgt);
1745 if (rc_tgt != 0) {
1746 logger->error("Speculative target decode failed: rc={}, "
1747 "n_past={}, draft_size={}",
1748 rc_tgt, state.n_past, state.draft.size());
1749 state.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
1750 state.error_message = "target llama_decode failed";
1751 state.finish_reason = "error";
1752 return false;
1753 }
// The draft context is only decoded once the target decode succeeded.
1754 int rc_dft = llama_decode(state.ctx_dft, state.batch_tgt);
1755 if (rc_dft != 0) {
1756 logger->error("Speculative draft decode failed: rc={}, "
1757 "n_past={}, draft_size={}",
1758 rc_dft, state.n_past, state.draft.size());
1759 state.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
1760 state.error_message = "draft llama_decode failed";
1761 state.finish_reason = "error";
1762 return false;
1763 }
1764 return true;
1765}
1766
// Trigger draft generation via common_speculative_draft. The per-sequence
// draft parameters are pointed at this run's state (current position,
// last token, prompt, and the output vector state.draft) before the call.
// Returns the number of draft tokens now held in state.draft.
1774 auto& dp = common_speculative_get_draft_params(
1775 state.spec, state.seq_id);
1776 dp.drafting = true;
// NOTE(review): -1 presumably means "no per-call cap, use the configured
// n_max" — confirm against common/speculative.h.
1777 dp.n_max = -1;
1778 dp.n_past = state.n_past;
1779 dp.id_last = state.id_last;
1780 dp.prompt = &state.prompt_tgt;
1781 dp.result = &state.draft;
1782 common_speculative_draft(state.spec);
1783 return static_cast<int>(state.draft.size());
1784}
1785
1799static std::string spec_emit_token(
1800 SpeculativeRunState& state, llama_token id,
1801 const llama_vocab* vocab, int max_tokens,
1802 std::function<void(std::string_view)>& on_token,
1803 std::atomic<bool>& cancel)
1804{
1805 std::string signal;
1806 state.prompt_tgt.push_back(state.id_last);
1807 state.id_last = id;
1808 state.n_generated++;
1809 if (llama_vocab_is_eog(vocab, id)) {
1810 state.has_eos = true;
1811 state.finish_reason = "stop";
1812 signal = "eos";
1813 } else {
1814 const std::string piece =
1815 common_token_to_piece(state.ctx_tgt, id);
1816 state.generated += piece;
1817 if (on_token) { on_token(piece); }
1818 if (cancel.load(std::memory_order_acquire)) {
1819 state.error_code = ENTROPIC_ERROR_CANCELLED;
1820 state.finish_reason = "cancelled";
1821 signal = "cancel";
1822 } else if (state.n_generated >= max_tokens) {
1823 state.finish_reason = "length";
1824 signal = "length";
1825 }
1826 }
1827 return signal;
1828}
1829
// Record the checkpoint position info (current prompt_tgt length plus the
// min/max positions held for seq_id in the *target* context's memory)
// and, only when use_ckpt_dft is set, snapshot the draft context state
// into the checkpoint. The position update itself is unconditional.
1842 state.ckpt.update_pos(
1843 static_cast<int64_t>(state.prompt_tgt.size()),
1844 llama_memory_seq_pos_min(
1845 llama_get_memory(state.ctx_tgt), state.seq_id),
1846 llama_memory_seq_pos_max(
1847 llama_get_memory(state.ctx_tgt), state.seq_id));
1848 if (state.use_ckpt_dft) {
1849 state.ckpt.update_dft(state.ctx_dft, state.seq_id,
1850 LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY
1851 | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
1852 }
1853}
1854
// Snapshot the target context state into the checkpoint, but only when
// the target needs checkpointing (use_ckpt_tgt) and there is a non-empty
// draft — with no draft there is nothing that could need rolling back.
1862 if (state.use_ckpt_tgt && !state.draft.empty()) {
1863 state.ckpt.update_tgt(state.ctx_tgt, state.seq_id,
1864 LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY
1865 | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
1866 }
1867}
1868
// Restore the draft context to its pre-draft state: reload the draft
// checkpoint when use_ckpt_dft is set, then (always) remove everything
// after the checkpointed max position from the draft's memory so the
// upcoming batch decode on the draft starts from the checkpoint.
1876 constexpr auto flags = LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY
1877 | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE;
1878 if (state.use_ckpt_dft) {
1879 state.ckpt.load_dft(state.ctx_dft, state.seq_id, flags);
1880 }
// p1 = -1 removes through the end of the sequence.
1881 llama_memory_seq_rm(llama_get_memory(state.ctx_dft),
1882 state.seq_id, state.ckpt.pos_max + 1, -1);
1883}
1884
// Partial-acceptance rollback: the sampled ids become the next round's
// draft, both contexts are reloaded from the checkpoint and trimmed past
// the checkpointed max position, prompt_tgt/n_past are rewound to the
// checkpointed token count, and the live sampler is replaced by the
// pre-sampling clone.
// (The function's header line is not present in this listing.)
//
// state      Run state to rewind (mutated).
// smpl_save  Sampler clone taken before sampling; ownership passes to
//            `state` here and the current state.smpl is freed.
// ids        Sampled ids; moved into state.draft (left moved-from).
1895 SpeculativeRunState& state, common_sampler* smpl_save,
1896 std::vector<llama_token>& ids) {
1897 constexpr auto flags = LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY
1898 | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE;
1899 state.draft = std::move(ids);
1900 state.ckpt.load_tgt(state.ctx_tgt, state.seq_id, flags);
1901 llama_memory_seq_rm(llama_get_memory(state.ctx_tgt),
1902 state.seq_id, state.ckpt.pos_max + 1, -1);
// NOTE(review): load_dft runs unconditionally here, while the draft
// snapshot is only taken when use_ckpt_dft is set — confirm load_dft is
// harmless when no draft snapshot exists.
1903 state.ckpt.load_dft(state.ctx_dft, state.seq_id, flags);
1904 llama_memory_seq_rm(llama_get_memory(state.ctx_dft),
1905 state.seq_id, state.ckpt.pos_max + 1, -1);
1906 state.prompt_tgt.resize(static_cast<size_t>(state.ckpt.n_tokens));
1907 state.n_past = static_cast<int>(state.prompt_tgt.size());
1908 // Sampler clone is non-null only when use_ckpt_tgt is set
1909 common_sampler_free(state.smpl);
1910 state.smpl = smpl_save;
1911}
1912
// Clear any stale KV positions left by rejected draft tokens: remove
// everything from position n_past to the end of seq_id on both the
// target and the draft context (p1 = -1 means "through the end").
// The return values of llama_memory_seq_rm are not checked.
1938 llama_memory_seq_rm(llama_get_memory(state.ctx_tgt),
1939 state.seq_id, state.n_past, -1);
1940 llama_memory_seq_rm(llama_get_memory(state.ctx_dft),
1941 state.seq_id, state.n_past, -1);
1942}
1943
// Walk the accepted ids in order, emitting each through spec_emit_token.
// Stops at the first id for which spec_emit_token returns a non-empty
// signal (later ids are not emitted) and returns true in that case;
// returns false when every id was emitted without a stop signal.
// (The function's header line is not present in this listing.)
1951 SpeculativeRunState& state,
1952 const std::vector<llama_token>& ids,
1953 const llama_vocab* vocab, int max_tokens,
1954 std::function<void(std::string_view)>& on_token,
1955 std::atomic<bool>& cancel) {
1956 bool stop = false;
1957 for (auto id : ids) {
1958 auto signal = spec_emit_token(
1959 state, id, vocab, max_tokens, on_token, cancel);
1960 if (!signal.empty()) { stop = true; break; }
1961 }
1962 return stop;
1963}
1964
// Prepare the draft for one accept round and return its size. A draft
// carried over from a rollback is reused as-is; otherwise the drafter
// runs, bracketed by the checkpoint calls in this order: save draft
// state, draft, save target state, restore draft state.
1986 // Skip drafting if the previous round restored a partial accept
1987 // into state.draft (carry-over from rollback).
1988 if (!state.draft.empty()) {
1989 return static_cast<int>(state.draft.size());
1990 }
1991 spec_ckpt_save_dft(state);
1992 int drafted = spec_run_draft(state);
// The target snapshot is taken after drafting because it is skipped when
// the draft came back empty.
1993 spec_ckpt_save_tgt(state);
1994 spec_ckpt_restore_dft(state);
1995 return drafted;
1996}
1997
// Run one speculative accept round: prepare a draft, decode the batch on
// both contexts, sample-and-accept against the draft, then either roll
// back (partial acceptance on a checkpointed target) or commit and emit
// the accepted tokens.
// Returns false to stop the outer loop (decode failure or a stop signal
// from emission), true to keep going — including after a rollback, which
// emits nothing.
// (The function's header line is not present in this listing.)
2004 SpeculativeRunState& state,
2005 const llama_vocab* vocab,
2006 int max_tokens,
2007 std::function<void(std::string_view)>& on_token,
2008 std::atomic<bool>& cancel)
2009{
2010 int draft_size_before = spec_prepare_draft(state);
2011
2012 if (!spec_decode_both(state)) { return false; }
2013
// The sampler is cloned before sampling so a rollback can restore it;
// the clone is either handed to spec_rollback_partial or freed below.
2014 common_sampler* smpl_save = nullptr;
2015 if (state.use_ckpt_tgt) {
2016 smpl_save = common_sampler_clone(state.smpl);
2017 }
2018 auto ids = common_sampler_sample_and_accept_n(
2019 state.smpl, state.ctx_tgt, state.draft);
// ids.size() - 1 is treated as the number of accepted draft tokens,
// clamped at zero for an empty ids vector.
2020 int accepted = static_cast<int>(ids.size()) - 1;
2021 if (accepted < 0) { accepted = 0; }
2022
2023 // Partial acceptance on a FULL-seq_rm context: rollback to
2024 // checkpoint, set draft = accepted, re-loop without emitting.
2025 if (state.use_ckpt_tgt
2026 && static_cast<int>(ids.size()) - 1
2027 < static_cast<int>(state.draft.size())) {
2028 spec_rollback_partial(state, smpl_save, ids);
2029 return true;
2030 }
2031 if (smpl_save) { common_sampler_free(smpl_save); }
2032
2033 common_speculative_accept(state.spec, state.seq_id, accepted);
2034 state.n_drafted += draft_size_before;
2035 state.n_accepted += accepted;
2036 // n_past advances by ids.size() total: one slot for id_last
2037 // (the post-id_last position the next id will occupy), plus
2038 // `accepted` slots for the drafted tokens the sampler agreed
2039 // with. Matches speculative-simple's n_past++ in batch_add +
2040 // n_past += ids.size() - 1 sequence.
2041 state.n_past += static_cast<int>(ids.size());
2042
2043 bool stop = spec_commit_accepted(
2044 state, ids, vocab, max_tokens, on_token, cancel);
2045 state.draft.clear();
// NOTE(review): listing line 2046 is absent here — confirm against the
// real source what runs between clearing the draft and returning.
2047 return !stop;
2048}
2049
2062static std::string spec_check_preconditions(
2063 bool target_active, bool draft_active,
2064 llama_context* ctx_tgt, llama_context* ctx_dft) {
2065 // Defense-in-depth arch gate — orchestrator's
2066 // check_speculative_compat is the primary gate; a direct caller
2067 // into the kernel must also be refused on recurrent / hybrid
2068 // targets (Session 5 Gate A: hybrid SSM state diverges across
2069 // split-prefill boundaries; bit-identical unreachable at this pin).
2070 std::string err;
2071 const llama_model* model_tgt = llama_get_model(ctx_tgt);
2072 int cap_tgt = common_context_can_seq_rm(ctx_tgt);
2073 int cap_dft = common_context_can_seq_rm(ctx_dft);
2074 logger->info("Speculative seq_rm capability: target={}, draft={} "
2075 "(0=NO, 1=PART, 2=FULL)", cap_tgt, cap_dft);
2076 if (!target_active || !draft_active) {
2077 err = "speculative requires ACTIVE target + draft";
2078 } else if (llama_model_is_recurrent(model_tgt)
2079 || llama_model_is_hybrid(model_tgt)) {
2080 err = "speculative refused: architecture (target is "
2081 "recurrent or hybrid; see proposal Implementation "
2082 "Log Gate A)";
2083 } else if (cap_tgt == COMMON_CONTEXT_SEQ_RM_TYPE_NO
2084 || cap_dft == COMMON_CONTEXT_SEQ_RM_TYPE_NO) {
2085 // NO is the only unsupported seq_rm case — the kernel has
2086 // both PART fast-path and FULL checkpoint paths.
2087 err = "speculative kernel requires at least FULL seq_rm "
2088 "(target/draft reported NO seq_rm at all)";
2089 }
2090 return err;
2091}
2092
// Create the sampler and the speculative decoder for a run, register the
// prompt with the decoder, allocate the target batch, and decide whether
// the checkpoint flow is needed for each context.
// Returns an empty string on success or an error message on failure. If
// the speculative decoder cannot be created, the sampler made here is
// freed and reset before returning.
// (The function's header line is not present in this listing.)
//
// state        Run state; ctx_tgt/ctx_dft/prompt_tgt must already be set.
// model_tgt    Target model used to initialize the sampler.
// params       Generation parameters (converted via to_common_sampling).
// n_draft_max  Max draft tokens per round; values <= 0 fall back to 16.
// draft_path   Draft model path (see the comment on mparams.path below).
2122 SpeculativeRunState& state, llama_model* model_tgt,
2123 const GenerationParams& params, int n_draft_max,
2124 const std::string& draft_path) {
2125 auto common_sampling = to_common_sampling(params);
2126 state.smpl = common_sampler_init(model_tgt, common_sampling);
2127 if (!state.smpl) { return "common_sampler_init failed"; }
2128
2129 common_params_speculative spec_params;
2130 spec_params.types = {COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE};
2131 spec_params.draft.n_max = (n_draft_max > 0) ? n_draft_max : 16;
2132 spec_params.draft.ctx_tgt = state.ctx_tgt;
2133 spec_params.draft.ctx_dft = state.ctx_dft;
2134 // Upstream gates DRAFT_SIMPLE on a non-empty draft path
2135 // (see common/speculative.cpp:875). Required even though we
2136 // provide already-loaded contexts.
2137 spec_params.draft.mparams.path = draft_path;
// NOTE(review): the second argument (1) is presumably the number of
// sequences — confirm against common/speculative.h.
2138 state.spec = common_speculative_init(spec_params, 1);
2139 if (!state.spec) {
2140 common_sampler_free(state.smpl);
2141 state.smpl = nullptr;
2142 return "common_speculative_init failed";
2143 }
2144
2145 common_speculative_begin(state.spec, state.seq_id, state.prompt_tgt);
// The batch is sized to the target context's n_batch; batch_initialized
// tells spec_cleanup that llama_batch_free is required.
2146 state.batch_tgt = llama_batch_init(llama_n_batch(state.ctx_tgt), 0, 1);
2147 state.batch_initialized = true;
2148 // Checkpoint flow lights up when either context can only do
2149 // FULL-sequence removal. Mirrors speculative-simple's
2150 // use_ckpt_{tgt,dft}.
2151 state.use_ckpt_tgt = common_context_can_seq_rm(state.ctx_tgt)
2152 == COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
2153 state.use_ckpt_dft = common_context_can_seq_rm(state.ctx_dft)
2154 == COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
2155 return "";
2156}
2157
2163static std::string spec_init_run(
2164 SpeculativeRunState& state, llama_model* model_tgt,
2165 const std::vector<llama_token>& tokens,
2166 const GenerationParams& params, int n_draft_max,
2167 const std::string& draft_path) {
2168 state.id_last = tokens.back();
2169 state.prompt_tgt.assign(tokens.begin(), tokens.end() - 1);
2170 state.n_past = static_cast<int>(tokens.size()) - 1;
2171
2172 llama_memory_clear(llama_get_memory(state.ctx_tgt), true);
2173 llama_memory_clear(llama_get_memory(state.ctx_dft), true);
2174
2175 if (!spec_prefill_minus_last(state.ctx_tgt, tokens)
2176 || !spec_prefill_minus_last(state.ctx_dft, tokens)) {
2177 return "speculative prefill failed";
2178 }
2180 state, model_tgt, params, n_draft_max, draft_path);
2181}
2182
2188static void spec_run_loop(
2189 SpeculativeRunState& state, const llama_vocab* vocab,
2190 int max_tokens,
2191 std::function<void(std::string_view)>& on_token,
2192 std::atomic<bool>& cancel) {
2193 while (state.n_generated < max_tokens) {
2194 if (cancel.load(std::memory_order_acquire)) {
2195 state.error_code = ENTROPIC_ERROR_CANCELLED;
2196 state.finish_reason = "cancelled";
2197 break;
2198 }
2199 if (!spec_accept_round(state, vocab, max_tokens,
2200 on_token, cancel)) {
2201 break;
2202 }
2203 }
2204 if (state.finish_reason.empty()) {
2205 state.finish_reason = (state.n_generated >= max_tokens)
2206 ? "length" : "stop";
2207 }
2208}
2209
// Turn the finished run state into a GenerationResult: copies content,
// token count, finish reason and error fields, stamps the elapsed time
// since t0, logs draft acceptance statistics when anything was drafted,
// and frees the kernel resources via spec_cleanup.
// Only generation_time_ms is computed here; no throughput figure is set
// in this function.
// (The function's header line is not present in this listing.)
2222 SpeculativeRunState& state,
2223 std::chrono::steady_clock::time_point t0) {
2224 GenerationResult result;
2225 result.content = state.generated;
2226 result.token_count = state.n_generated;
2227 result.finish_reason = state.finish_reason;
2228 result.error_code = state.error_code;
2229 result.error_message = state.error_message;
2230 result.generation_time_ms =
2231 entropic::log::elapsed_ms(t0, entropic::log::now());
// Guarding on n_drafted avoids a division by zero in the accept rate.
2232 if (state.n_drafted > 0) {
2233 const float accept_rate =
2234 static_cast<float>(state.n_accepted)
2235 / static_cast<float>(state.n_drafted);
2236 logger->info("Speculative: generated={}, drafted={}, "
2237 "accepted={}, accept_rate={:.3f}",
2238 state.n_generated, state.n_drafted,
2239 state.n_accepted, accept_rate);
2240 }
2241 spec_cleanup(state);
2242 return result;
2243}
2244
// Entry point for the speculative-decoding kernel once the prompt is
// tokenized: sets up a fresh run state on the two contexts, initializes
// it, runs the accept loop, and finalizes into a GenerationResult.
// On initialization failure everything allocated so far is freed and an
// ENTROPIC_ERROR_GENERATE_FAILED result carrying the init message is
// returned.
// (The function's header line is not present in this listing.)
2286 llama_context* ctx_tgt, llama_context* ctx_dft, llama_model* model_tgt,
2287 const std::vector<llama_token>& tokens, const GenerationParams& params,
2288 std::function<void(std::string_view)>& on_token,
2289 std::atomic<bool>& cancel, int n_draft_max,
2290 const std::string& draft_path,
2291 std::chrono::steady_clock::time_point t0) {
2292 SpeculativeRunState state;
2293 state.ctx_tgt = ctx_tgt;
2294 state.ctx_dft = ctx_dft;
2295 auto init_err = spec_init_run(state, model_tgt, tokens, params,
2296 n_draft_max, draft_path);
2297 if (!init_err.empty()) {
2298 spec_cleanup(state);
2299 return spec_error(ENTROPIC_ERROR_GENERATE_FAILED,
2300 std::move(init_err));
2301 }
2302 spec_run_loop(state, llama_model_get_vocab(model_tgt),
2303 params.max_tokens, on_token, cancel);
// spec_finalize also performs the cleanup for the success path.
2304 return spec_finalize(state, t0);
2305}
2306
// Speculative-decoding kernel with an explicit draft backend: validates
// preconditions on this (target) backend and `draft`, renders and
// tokenizes the prompt with the target's chat template, and runs the
// kernel on the two backends' contexts.
// Returns an error result when preconditions fail or when the prompt has
// fewer than 2 tokens (the kernel holds the last token back from
// prefill, so it needs at least one more).
// (The function's header line is not present in this listing.)
2313 const std::vector<Message>& messages,
2314 const GenerationParams& params,
2315 std::function<void(std::string_view)> on_token,
2316 std::atomic<bool>& cancel,
2317 LlamaCppBackend& draft,
2318 int n_draft_max,
2319 const std::string& draft_path)
2320{
2321 auto t0 = entropic::log::now();
2322 auto pre_err = spec_check_preconditions(
2323 is_active(), draft.is_active(), ctx_, draft.ctx_);
2324 GenerationResult result;
2325 if (!pre_err.empty()) {
// The error code is chosen by matching the message text, so the
// "requires ACTIVE" wording in spec_check_preconditions is load-bearing.
// NOTE(review): listing lines 2328-2329 (the two branches of this
// conditional) are absent — confirm the codes against the real source.
2326 entropic_error_t code =
2327 (pre_err.find("requires ACTIVE") != std::string::npos)
2330 result = spec_error(code, std::move(pre_err));
2331 } else {
2332 auto prompt = apply_chat_template(messages, params);
2333 auto tokens = tokenize(prompt, true);
2334 if (tokens.size() < 2) {
2335 result = spec_error(ENTROPIC_ERROR_GENERATE_FAILED,
2336 "speculative prompt must have at least 2 tokens");
2337 } else {
2338 logger->info("Speculative: {} input tokens, max_tokens={}, "
2339 "n_draft_max={}",
2340 tokens.size(), params.max_tokens, n_draft_max);
2341 result = spec_run_from_tokens(
2342 ctx_, draft.ctx_, model_, tokens, params, on_token,
2343 cancel, n_draft_max, draft_path, t0);
2344 }
2345 }
2346 return result;
2347}
2348
// Raw text completion without a chat template: the prompt is tokenized
// as-is (second tokenize argument false, i.e. add_special is off) and
// fed to decode_loop with no token callback and no cancel flag, then
// timing is filled in by finalize_result.
// (The function's header line is not present in this listing.)
2358 const std::string& prompt,
2359 const GenerationParams& params)
2360{
2361 auto t0 = entropic::log::now();
2362 auto tokens = tokenize(prompt, false);
2363
2364 logger->info("Complete: {} input tokens, max_tokens={}",
2365 tokens.size(), params.max_tokens);
2366 log_sampler_config(params);
2367 auto result = decode_loop(tokens, params, nullptr, nullptr);
2368 finalize_result(result, t0);
2369 return result;
2370}
2371
2372// ── Architecture detection (v1.9.13) ───────────────────────
2373
// Report whether the loaded model is recurrent; simply returns the
// cached is_recurrent_ flag.
2381 return is_recurrent_;
2382}
2383
2384// ── Capability overrides (v1.9.13) ─────────────────────────
2385
// Declare llama.cpp backend capabilities. Out-of-range enum values are
// rejected first; otherwise the static table answers "always supported",
// and entries that are false there get a second chance through the
// dynamic checks (model architecture, mmproj configuration, mtmd audio).
2394 int idx = static_cast<int>(cap);
2395 int count = static_cast<int>(BackendCapability::_COUNT);
2396 if (idx < 0 || idx >= count) {
2397 return false;
2398 }
2399
2400 // Static capabilities: true = always supported. Length must equal
2401 // BackendCapability::_COUNT — trailing entries get appended as new
2402 // capabilities are introduced (gh#53 added AUDIO at index 12).
// NOTE(review): nothing here enforces that the table length matches
// _COUNT; if the enum grows without a new entry, always[idx] below reads
// past the array. A static_assert on the array size would catch that.
2403 static constexpr bool always[] = {
2404 false, false, true, true, true, true,
2405 false, true, true, false, false, true,
2406 false, // AUDIO — dynamic only (mtmd_support_audio)
2407 };
2408
2409 // Dynamic capabilities override the static table
2410 bool result = always[idx];
2411 if (!result) {
// NOTE(review): listing lines 2413 and 2419 are absent from the
// expression below (each introduces another capability clause) — confirm
// against the real source.
2412 result = (cap == BackendCapability::KV_CACHE && !is_recurrent())
2414 || (cap == BackendCapability::VISION
2415 && !config().mmproj_path.empty())
2416 || (cap == BackendCapability::AUDIO
2417 && mtmd_ctx_ != nullptr
2418 && mtmd_support_audio(mtmd_ctx_))
2420 && !is_recurrent());
2421 }
2422 return result;
2423}
2424
// Return the backend name; always the fixed string "llama.cpp".
2432 return "llama.cpp";
2433}
2434
// Populate backend metadata. Name, compute device (chosen at compile
// time from the ENTROPIC_BACKEND_* macros) and model format are always
// filled; the model-derived fields are only filled when the backend is
// not COLD and a model is loaded.
2442 BackendInfo bi;
2443 bi.name = "llama.cpp";
2444#if defined(ENTROPIC_BACKEND_CUDA)
2445 bi.compute_device = "cuda";
2446#elif defined(ENTROPIC_BACKEND_VULKAN)
2447 bi.compute_device = "vulkan";
2448#else
2449 bi.compute_device = "cpu";
2450#endif
2451 bi.model_format = "gguf";
2452
2453 if (state() != ModelState::COLD && model_ != nullptr) {
2454 bi.architecture = is_recurrent() ? "recurrent" : "transformer";
// NOTE(review): listing lines 2455-2456 are absent here — confirm
// against the real source.
2457 bi.parameter_count = llama_model_n_params(model_);
// VRAM usage is reported as 0; the full model size is attributed to RAM.
2458 bi.vram_bytes = 0;
2459 bi.ram_bytes = llama_model_size(model_);
2460
// The quantization field is filled with llama_model_desc's description
// string (truncated to the 256-byte buffer).
2461 char desc[256] = {};
2462 llama_model_desc(model_, desc, sizeof(desc));
2463 bi.quantization = desc;
2464 }
2465 return bi;
2466}
2467
// Clear KV cache or recurrent hidden state held by the context.
// Returns false when there is no context; otherwise true. A negative
// seq_id clears the whole memory, a non-negative one removes just that
// sequence (p0 = p1 = -1 selects the sequence's full position range).
// NOTE(review): the result of llama_memory_seq_rm is ignored; llama.h
// documents whole-sequence removal as never failing — confirm for the
// pinned llama.cpp version.
2476 if (ctx_ == nullptr) {
2477 return false;
2478 }
2479 auto mem = llama_get_memory(ctx_);
2480 if (seq_id < 0) {
2481 llama_memory_clear(mem, true);
2482 } else {
2483 llama_memory_seq_rm(mem, seq_id, -1, -1);
2484 }
2485 return true;
2486}
2487
2488} // namespace entropic
std::string last_error_
Last error message for diagnostics.
Definition backend.h:611
bool is_active() const
True when state is ACTIVE.
Definition backend.h:224
ModelState state() const
Current lifecycle state (lock-free read).
Definition backend.h:216
const ModelConfig & config() const
Stored model config.
Definition backend.h:278
int context_length() const
Model's context window size.
Definition backend.h:257
LlamaCppBackend — common llama.cpp patterns (15% layer).
bool load_gpu_model()
Load the GGUF model onto the GPU (do_activate step 1).
bool do_load(const ModelConfig &config) override
Load model into CPU RAM (COLD → WARM).
bool do_supports(BackendCapability cap) const override
Declare llama.cpp backend capabilities.
GenerationResult decode_loop(const std::vector< llama_token > &tokens, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > *cancel)
Core decode loop — shared by generate and streaming.
bool is_recurrent_
True if loaded model is recurrent (GDN/Mamba/RWKV).
LogprobResult do_evaluate_logprobs(const int32_t *tokens, int n_tokens) override
Evaluate per-token log-probabilities via sequential decode.
std::string do_backend_name() const override
Return backend name.
std::unique_ptr< PromptCache > prompt_cache_
KV prefix cache (v1.8.3)
std::string step_token(llama_sampler *sampler, std::string &generated, std::function< void(std::string_view)> &on_token, const std::vector< std::string > &stop)
Generate one token and append to output.
GenerationResult do_generate(const std::vector< Message > &messages, const GenerationParams &params) override
Generate a complete response using chat template.
GenerationResult do_complete(const std::string &prompt, const GenerationParams &params) override
Raw text completion without chat template.
std::vector< llama_token > tokenize(const std::string &text, bool add_special) const
Tokenize text using model vocabulary.
bool create_inference_context()
Create the llama context + prompt cache (do_activate step 2).
const llama_vocab * vocab_
Vocabulary (from model_)
int compute_prefix_token_count(const std::vector< Message > &messages, const GenerationParams &params)
Compute token count of system messages only.
std::string detokenize(llama_token token) const
Detokenize a single token.
void init_mmproj_if_configured()
Initialize the libmtmd context if mmproj is configured.
GenerationResult generate_speculative_with_draft(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel, LlamaCppBackend &draft, int n_draft_max, const std::string &draft_path)
Speculative-decoding kernel with explicit draft backend.
llama_context * ctx_
Inference context (ACTIVE)
bool run_prefill(const std::vector< llama_token > &tokens)
Run batched prefill on input tokens.
GenerationResult run_sampling_loop(const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel, const std::chrono::steady_clock::time_point &t0)
Sample tokens until stop / max_tokens / cancel.
bool restore_cached_prefix(const CacheEntry *cached, const std::vector< llama_token > &tokens)
Restore KV state from cache and decode remaining tokens.
void save_prefix_to_cache(const CacheKey &key, int prefix_tokens)
Capture seq 0 KV state and store under the given key.
std::vector< int32_t > tokenize_text(const std::string &text) const override
Tokenize text to token IDs using model vocabulary.
bool is_recurrent() const
Check if loaded model is recurrent.
entropic_error_t mtmd_prefill(const std::string &prompt, const std::vector<::mtmd_bitmap * > &bitmaps, std::string &err_msg)
Run mtmd_tokenize + mtmd_helper_eval_chunks on a prompt.
bool run_prefill_cached(const std::vector< llama_token > &tokens, const std::string &system_prompt, const std::vector< Message > &messages, const GenerationParams &params)
Run prefill with prompt cache integration.
GenerationResult do_generate_text_only(const std::vector< Message > &messages, const GenerationParams &params)
Text-only batch generation (extracted from do_generate).
std::string apply_chat_template(const std::vector< Message > &messages, const GenerationParams &params) const
Apply chat template to messages.
GenerationResult do_generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
Streaming generation with per-token callback.
bool has_vision_
Cached mtmd_support_vision(mtmd_ctx_) result.
bool decode_tokens_from(const std::vector< llama_token > &tokens, int start_offset)
Decode tokens starting at a given offset.
void release_temp_seq_id(llama_seq_id seq_id)
Release a temporary sequence ID back to the pool.
int do_count_tokens(const std::string &text) const override
Count tokens in text.
::mtmd_context * mtmd_ctx_
libmtmd context, or nullptr if no mmproj loaded.
GenerationResult generate_multimodal(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel)
Multimodal generation core (v1.9.11 Phases 5–7).
std::mutex seq_id_mutex_
Guards temp seq_id pool (v1.9.10)
bool do_clear_state(int seq_id) override
Clear KV cache or recurrent hidden state.
static float extract_token_logprob(const float *logits, int32_t next_token, int n_vocab)
Extract log-probability for a token from logits.
void do_deactivate() override
Deactivate: free context, reload model CPU-only.
llama_sampler * create_sampler(const GenerationParams &params) const
Create sampler chain from generation params.
BackendInfo do_info() const override
Populate backend metadata from llama.cpp model.
bool do_activate() override
Activate model on GPU (WARM → ACTIVE).
bool prefill_and_cache_prefix(const std::vector< llama_token > &tokens, int prefix_tokens, const CacheKey &key)
Two-pass prefill: prefix-only prefill → save → rest.
llama_seq_id allocate_temp_seq_id()
Allocate a temporary sequence ID for evaluation.
PromptCacheConfig prompt_cache_config_
Cache config (v1.8.3)
void do_unload() override
Full unload — free all resources, clear prompt cache.
llama_model * model_
Loaded model (WARM+)
~LlamaCppBackend() override
Free llama.cpp + mtmd resources on destruction.
std::vector< llama_seq_id > free_seq_ids_
Available temporary seq_ids (v1.9.10)
GenerationResult do_generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
Speculative streaming via the abstract InferenceBackend interface (kept as NOT_SUPPORTED — see kernel...
GenerationResult do_generate_streaming_text_only(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Text-only streaming generation (extracted from streaming).
static std::string extract_system_prompt(const std::vector< Message > &messages)
Extract the system prompt from messages.
static CacheKey make_key(std::string_view prompt_text, std::string_view model_path)
Compute a cache key from prompt text and model path.
entropic_error_t
Error codes returned by all C API functions.
Definition error.h:35
@ ENTROPIC_OK
Success.
Definition error.h:36
@ ENTROPIC_ERROR_CANCELLED
Operation cancelled via cancel token.
Definition error.h:48
@ ENTROPIC_ERROR_IMAGE_LOAD_FAILED
Image file could not be read or decoded (v1.9.11)
Definition error.h:80
@ ENTROPIC_ERROR_NOT_SUPPORTED
Capability not supported by this backend (v1.9.13)
Definition error.h:84
@ ENTROPIC_ERROR_GENERATE_FAILED
Generation failed (context overflow, model error)
Definition error.h:42
@ ENTROPIC_ERROR_INVALID_STATE
Operation not valid in current state (e.g., generate before activate)
Definition error.h:39
LlamaCppBackend — llama.cpp C API integration.
spdlog initialization and logger access.
auto now()
Get current time for timing measurements.
Definition logging.h:193
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
Definition logging.cpp:211
double elapsed_ms(std::chrono::steady_clock::time_point start, std::chrono::steady_clock::time_point end)
Compute elapsed milliseconds between two time points.
Definition logging.h:203
Activate model on GPU (WARM → ACTIVE).
@ IMAGE
Image content (local path or data URI)
BackendCapability
Capabilities that an inference backend may or may not support.
@ SPECULATIVE_DECODING
Speculative decoding compatibility.
@ HIDDEN_STATE
Recurrent hidden state management (save/load/reset)
@ VISION
Vision / multimodal input (v1.9.11)
@ KV_CACHE
KV cache state management (save/load/clear)
@ AUDIO
Audio input via mtmd audio projector (gh#53, v2.3.0)
@ _COUNT
Sentinel — must be last. Used for iteration/array sizing.
bool has_images(const std::vector< ContentPart > &parts)
Check if content parts contain any image parts.
Definition content.cpp:41
static bool spec_decode_both(SpeculativeRunState &state)
Decode the speculative batch on both contexts.
std::string extract_text(const std::vector< ContentPart > &parts)
Extract concatenated text from content parts.
Definition content.cpp:20
static void spec_ckpt_save_tgt(SpeculativeRunState &state)
Snapshot target state right before the target decode of the speculative batch (when use_ckpt_tgt + no...
static void spec_trim_rejected_drafts(SpeculativeRunState &state)
Clear any stale KV positions left by rejected draft tokens.
static bool spec_commit_accepted(SpeculativeRunState &state, const std::vector< llama_token > &ids, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
Walk accepted ids, emit tokens via callback, update state.
static std::string spec_emit_token(SpeculativeRunState &state, llama_token id, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
Emit on_token for one accepted id, updating state and returning a stop signal when terminating condit...
static void spec_rollback_partial(SpeculativeRunState &state, common_sampler *smpl_save, std::vector< llama_token > &ids)
Partial-acceptance rollback: restore both contexts and the sampler to their pre-draft state,...
static std::string spec_check_preconditions(bool target_active, bool draft_active, llama_context *ctx_tgt, llama_context *ctx_dft)
Validate speculative preconditions and reject NO-seq_rm.
@ COLD
On disk only, no RAM consumed.
static std::string concat_messages_fallback(const std::vector< Message > &messages)
Plain "role: content" join used when templating fails.
static int spec_run_draft(SpeculativeRunState &state)
Trigger draft generation via common_speculative_draft.
static void spec_run_loop(SpeculativeRunState &state, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
Run the accept-round loop until completion / EOS / cancel.
static std::vector< llama_chat_message > to_llama_chat(const std::vector< Message > &messages)
Apply GGUF-embedded chat template to messages.
static GenerationResult spec_finalize(SpeculativeRunState &state, std::chrono::steady_clock::time_point t0)
Speculative kernel against an explicit draft backend.
static bool spec_accept_round(SpeculativeRunState &state, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
Run one speculative accept round; return false to stop.
static GenerationResult spec_run_from_tokens(llama_context *ctx_tgt, llama_context *ctx_dft, llama_model *model_tgt, const std::vector< llama_token > &tokens, const GenerationParams &params, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel, int n_draft_max, const std::string &draft_path, std::chrono::steady_clock::time_point t0)
Public entry point for the speculative-decoding kernel.
static void spec_build_batch(SpeculativeRunState &state)
Build the target batch [id_last, draft0, ..., draftN-1].
static int spec_prepare_draft(SpeculativeRunState &state)
Drive one accept round: optional draft generation, decode on both contexts, sample-and-accept,...
static void spec_ckpt_save_dft(SpeculativeRunState &state)
Drive one accept round: draft → decode → sample-and-accept → emit tokens.
static void spec_cleanup(SpeculativeRunState &state)
Free everything allocated by the kernel.
static std::string spec_init_sampler_and_decoder(SpeculativeRunState &state, llama_model *model_tgt, const GenerationParams &params, int n_draft_max, const std::string &draft_path)
Initialize the kernel state: clear KV, prefill, sampler, speculative context, batch,...
static void spec_ckpt_restore_dft(SpeculativeRunState &state)
Restore the draft's pre-draft state so the upcoming target-batch decode on the draft re-fills cleanly...
static std::string spec_init_run(SpeculativeRunState &state, llama_model *model_tgt, const std::vector< llama_token > &tokens, const GenerationParams &params, int n_draft_max, const std::string &draft_path)
Initialize speculative run state (prefill + sampler + decoder).
Backend metadata for introspection.
size_t ram_bytes
RAM consumed by loaded model (bytes). 0 if COLD.
int max_context_length
Maximum context length.
size_t parameter_count
Number of parameters (from model metadata).
std::string architecture
Architecture family of the loaded model.
std::string compute_device
Compute device: "cuda", "vulkan", "cpu", or "npu".
std::string name
Backend identifier (e.g. "llama.cpp", "axcl")
std::string quantization
Quantization type (e.g. "IQ3_XXS", "Q8_0", "fp16").
size_t vram_bytes
VRAM consumed by loaded model (bytes). 0 if COLD.
std::string model_format
Model file format: "gguf", "axmodel", "onnx", etc.
Single cached KV state snapshot.
std::vector< uint8_t > data
Raw KV cache bytes.
size_t data_size
Equal to data.size(); kept for quick byte accounting.
int token_count
Prompt tokens covered by this entry.
64-bit hash used as cache lookup key.
Generation parameters for a single inference call.
Definition config.h:227
std::string grammar
GBNF grammar string (empty = unconstrained)
Definition config.h:240
int top_k
Top-K sampling.
Definition config.h:230
float repeat_penalty
Repetition penalty.
Definition config.h:231
float temperature
Sampling temperature.
Definition config.h:228
int max_tokens
Maximum tokens to generate.
Definition config.h:232
float top_p
Nucleus sampling threshold.
Definition config.h:229
int seed
RNG seed for reproducible sampling.
Definition config.h:237
std::vector< std::string > stop
Stop sequences.
Definition config.h:246
Result of a single generation call.
entropic_error_t error_code
Error code (ENTROPIC_OK if no error)
double generation_time_ms
Wall-clock generation time.
std::string finish_reason
Finish reason: "stop", "length", "error".
std::string content
Generated text (cleaned by adapter)
std::string error_message
Error description (empty if no error)
int token_count
Generated token count.
Per-token log-probability evaluation result.
std::vector< float > logprobs
Log-prob for each token transition (N-1 values)
int n_logprobs
Number of logprob values (n_tokens - 1)
int n_tokens
Number of input tokens.
float total_logprob
Sum of all logprob values.
float perplexity
exp(-mean(logprobs)) — lower = less surprising
std::vector< int32_t > tokens
Input tokens echoed back for verification.
Model configuration for a single tier.
Definition config.h:148
std::filesystem::path mmproj_path
Vision projector GGUF path.
Definition config.h:174
int gpu_layers
GPU offload layers (-1 = all)
Definition config.h:152
int context_length
Context window size (512–131072)
Definition config.h:151
std::filesystem::path path
Resolved model file path.
Definition config.h:149
int n_threads
CPU threads (0 = auto-detect)
Definition config.h:161
std::string cache_type_k
KV cache key quantization type.
Definition config.h:158
std::string cache_type_v
KV cache value quantization type.
Definition config.h:159
int n_batch
Batch size for prompt processing.
Definition config.h:160
bool flash_attn
Enable flash attention.
Definition config.h:163
bool use_mlock
Lock model in system RAM.
Definition config.h:154
size_t max_bytes
Maximum cache RAM (512 MB default)
Definition config.h:197
bool log_hits
Log cache hit/miss at INFO level.
Definition config.h:199
bool enabled
Master switch (false = no caching)
Definition config.h:198
Bundles per-kernel-run mutable state to keep the loop body focused on its responsibility (knots: cogn...