// entropic/llama__cpp__backend_8cpp_source.html

// SPDX-License-Identifier: Apache-2.0

#include "llama_cpp_backend.h"

#include "llama_cpp_sampler.h"

#include "llama_cpp_tokenizer.h"

#include "warm_keep_util.h"  // gh#96: common_prefix_len / warm_keep_cut

#include "tool_call_markers.h"  // gh#103: family-aware tool-call close marker

#include "batch_util.h"  // gh#98: batch_shared_prefix_len / batch_is_viable

#include "mtp_envelope.h"  // gh#108: mtp_unsupported_reason (fail-loud envelope)


#include <entropic/inference/adapters/adapter_base.h>  // gh#90 coerce_string_typed_args

#include <entropic/types/logging.h>


#include <common.h>

#include <chat.h>

#include <sampling.h>

#include <speculative.h>

#include <mtmd.h>

#include <mtmd-helper.h>


#include <nlohmann/json.hpp>


#include <cmath>

#include <cstring>

#include <optional>

#include <stdexcept>


namespace entropic {


namespace {


auto logger = entropic::log::get("inference.llama_cpp");


bool ends_with(const std::string& text, const std::string& suffix) {

    return text.size() >= suffix.size()

        && text.compare(text.size() - suffix.size(), suffix.size(), suffix) == 0;

}


bool check_stop_sequences(

    const std::string& text,

    const std::vector<std::string>& stop_sequences)

{

    for (const auto& stop : stop_sequences) {

        if (!stop.empty() && ends_with(text, stop)) {

            return true;

        }

    }

    return false;

}


GenerationResult prefill_error() {

    GenerationResult r;

    r.error_code = ENTROPIC_ERROR_GENERATE_FAILED;

    r.error_message = "Prefill decode failed";

    r.finish_reason = "error";

    return r;

}


void log_sampler_config(const GenerationParams& params) {

    logger->info("Sampler: temp={:.2f}, top_k={}, top_p={:.2f}, "

                 "repeat_penalty={:.2f}, thinking={}",

                 params.temperature, params.top_k, params.top_p,

                 params.repeat_penalty, params.enable_thinking);

}


void finalize_result(GenerationResult& result,

    std::chrono::steady_clock::time_point start_time)

{

    auto end = entropic::log::now();

    result.generation_time_ms = entropic::log::elapsed_ms(

        start_time, end);

    if (result.token_count > 0 && result.generation_time_ms > 0.0) {

        result.throughput_tok_s =

            static_cast<double>(result.token_count)

            / result.generation_time_ms * 1000.0;

    }

    logger->info("Generated: {} tokens, finish={}, {:.0f}ms, "

                 "{:.1f} tok/s",

                 result.token_count, result.finish_reason,

                 result.generation_time_ms, result.throughput_tok_s);

    logger->info("Content:\n{}", result.content);

}


void finalize_generation(GenerationResult& result,

    const std::string& generated, int n_generated,

    const GenerationParams& params,

    std::chrono::steady_clock::time_point t0)

{

    if (n_generated >= params.max_tokens

            && result.finish_reason.empty()) {

        result.finish_reason = "length";

    }

    result.content = generated;

    result.token_count = n_generated;

    finalize_result(result, t0);

}


GenerationResult sampler_init_error(

    std::chrono::steady_clock::time_point t0)

{

    GenerationResult r;

    r.error_code = ENTROPIC_ERROR_GENERATE_FAILED;

    r.error_message = "Sampler factory not initialized";

    r.finish_reason = "error";

    finalize_result(r, t0);

    return r;

}


ggml_type parse_kv_cache_type(const std::string& s) {

    static const std::pair<const char*, ggml_type> kTable[] = {

        {"f16",  GGML_TYPE_F16},

        {"f32",  GGML_TYPE_F32},

        {"bf16", GGML_TYPE_BF16},

        {"q8_0", GGML_TYPE_Q8_0},

        {"q4_0", GGML_TYPE_Q4_0},

    };

    for (const auto& [name, type] : kTable) {

        if (s == name) { return type; }

    }

    logger->warn("Unknown cache_type '{}' — defaulting to f16", s);

    return GGML_TYPE_F16;

}


llama_split_mode parse_split_mode(const std::string& s) {

    if (s.empty()) { return LLAMA_SPLIT_MODE_LAYER; }

    static const std::pair<const char*, llama_split_mode> kTable[] = {

        {"none",  LLAMA_SPLIT_MODE_NONE},

        {"layer", LLAMA_SPLIT_MODE_LAYER},

        {"row",   LLAMA_SPLIT_MODE_ROW},

    };

    for (const auto& [name, mode] : kTable) {

        if (s == name) { return mode; }

    }

    logger->warn("Unknown split_mode '{}' — defaulting to layer", s);

    return LLAMA_SPLIT_MODE_LAYER;

}


llama_model_params build_load_mparams(const entropic::ModelConfig& cfg) {

    llama_model_params m = llama_model_default_params();

    m.n_gpu_layers = cfg.gpu_layers;

    m.use_mmap = true;

    m.use_mlock = cfg.use_mlock;

    m.split_mode = parse_split_mode(cfg.split_mode);

    // gh#23 MVP item 7 (v2.3.19): main_gpu. Effective when split_mode

    // is "none" (pin) or "row" (small-tensor placement). 0 keeps

    // pre-v2.3.19 load bit-for-bit.

    m.main_gpu = cfg.main_gpu;

    return m;

}


} // anonymous namespace


// ── Lifecycle ──────────────────────────────────────────────


bool LlamaCppBackend::do_load(const ModelConfig& config) {

    llama_model_params mparams = llama_model_default_params();

    mparams.n_gpu_layers = 0;

    mparams.use_mmap = true;

    mparams.use_mlock = config.use_mlock;


    model_ = llama_model_load_from_file(config.path.c_str(), mparams);

    if (!model_) {

        last_error_ = "llama_model_load_from_file failed: " + config.path.string();

        return false;

    }


    vocab_ = llama_model_get_vocab(model_);

    is_recurrent_ = llama_model_is_recurrent(model_);

    is_hybrid_ = llama_model_is_hybrid(model_);  // gh#97: attn + recurrent/SSM

    // v2.3.10: wire the Tokenizer seam now that vocab_ is valid.

    // Lifetime: tokenizer_ borrows vocab_; do_unload resets

    // tokenizer_ BEFORE freeing the model so the borrow never dangles.

    tokenizer_ = std::make_unique<LlamaCppTokenizer>(vocab_);

    logger->info("Model loaded (CPU): {} tokens in vocab, recurrent={}",

              llama_vocab_n_tokens(vocab_), is_recurrent_);

    return true;

}


namespace {

llama_context_params build_cparams(const entropic::ModelConfig& cfg) {

    llama_context_params c = llama_context_default_params();

    c.n_ctx = static_cast<uint32_t>(cfg.context_length);

    c.n_batch = static_cast<uint32_t>(cfg.n_batch);

    // gh#23 MVP item 5 (v2.3.17): n_ubatch. 0 keeps llama.cpp's default

    // (== n_batch in practice), preserving pre-v2.3.17 chunking.

    if (cfg.n_ubatch > 0) {

        c.n_ubatch = static_cast<uint32_t>(cfg.n_ubatch);

    }

    c.n_threads = cfg.n_threads > 0

        ? static_cast<uint32_t>(cfg.n_threads)

        : std::thread::hardware_concurrency();

    c.flash_attn_type = cfg.flash_attn

        ? LLAMA_FLASH_ATTN_TYPE_ENABLED

        : LLAMA_FLASH_ATTN_TYPE_DISABLED;

    c.type_k = parse_kv_cache_type(cfg.cache_type_k);

    c.type_v = parse_kv_cache_type(cfg.cache_type_v);

    // gh#23 MVP item 8 (v2.3.20): offload_kqv. true (default) matches

    // llama.cpp's default — bit-identical for callers not opting out.

    c.offload_kqv = cfg.offload_kqv;

    // gh#23 MVP items 9 + 10 (v2.3.21 + v2.3.22): RoPE frequency

    // overrides. Both 0.0 = use model's trained value — bit-identical.

    c.rope_freq_base = cfg.rope_freq_base;

    c.rope_freq_scale = cfg.rope_freq_scale;

    // gh#23 MVP item 11 (v2.3.23): n_parallel maps to cparams.n_seq_max.

    // 1 (default) matches llama.cpp's default — bit-identical.

    c.n_seq_max = static_cast<uint32_t>(cfg.n_parallel);

    // gh#98 (v2.8.0): a unified KV buffer is REQUIRED for llama_memory_seq_cp

    // (the same-prefix batch fan-out) — seq_cp asserts on per-sequence buffers.

    // llama.cpp also recommends kv_unified exactly when sequences share a large

    // prefix (our case). Only enabled when batching is configured (n_parallel>1)

    // so single-sequence handles keep llama.cpp's default.

    c.kv_unified = (cfg.n_parallel > 1);

    // gh#108 (v2.9.2): llama_context_default_params() returns swa_full=true (a

    // full-context SWA cache), but the CLI default is false. For Gemma-4 (mostly

    // sliding-window: window=512, 5:1 SWA:global) the un-windowed cache wastes

    // ~5 GB at 128k. Set false — the memory-efficient windowed mode. Validated

    // against warm-keep / prompt-cache reuse over a >window prefix (the SWA layers

    // keep only the last `window` tokens, so KV reuse must not assume full-context

    // SWA residency — covered by the long-context warm-keep model test).

    c.swa_full = false;

    return c;

}

} // anonymous namespace


bool LlamaCppBackend::do_activate() {

    if (!load_gpu_model()) { return false; }

    if (!create_inference_context()) { return false; }

    // v2.3.10: wire the Sampler seam once ctx_ / vocab_ are live.

    // Lifetime: factory borrows ctx_ + vocab_; do_deactivate /

    // do_unload reset sampler_factory_ BEFORE freeing those handles

    // so the borrow never dangles.

    sampler_factory_ = std::make_unique<LlamaCppSamplerFactory>(

        ctx_, vocab_);

    init_mmproj_if_configured();

    return true;

}


bool LlamaCppBackend::load_gpu_model() {

    llama_model_params mparams = build_load_mparams(config());


    if (!config().tensor_split.empty()) {

        // TODO: parse tensor_split string into float array for multi-GPU

        logger->warn("tensor_split not yet implemented, ignoring");

    }


    // tokenizer_ borrows the old vocab_; reset it before the free so the

    // borrow never dangles. Then free the WARM model and null the

    // handles so a failed reload below leaves the backend in a clean,

    // recoverable state rather than a dangling one.

    tokenizer_.reset();

    if (model_ != nullptr) {

        llama_model_free(model_);

        model_ = nullptr;

        vocab_ = nullptr;

    }


    model_ = llama_model_load_from_file(config().path.c_str(), mparams);

    if (model_ == nullptr) {

        // llama.cpp returns null with no error string — the actual

        // reason (OOM, CUDA init failure, GGUF parse error, etc.) only

        // surfaces in ggml's log stream. Point the operator at it so

        // multi-handle GPU failures (gh#58 v2.2.7 follow-up) are

        // diagnosable without source-diving llama.cpp.

        last_error_ = "Failed to reload model with GPU layers "

                      "(path=" + config().path.string()

                    + ", gpu_layers=" + std::to_string(config().gpu_layers)

                    + ") — check llama_ggml.log in the engine's log_dir "

                      "for the underlying llama.cpp/CUDA error";

        return false;

    }


    vocab_ = llama_model_get_vocab(model_);

    tokenizer_ = std::make_unique<LlamaCppTokenizer>(vocab_);

    return true;

}


bool LlamaCppBackend::create_inference_context() {

    llama_context_params cparams = build_cparams(config());


    ctx_ = llama_init_from_model(model_, cparams);

    if (!ctx_) {

        last_error_ = "llama_init_from_model failed";

        return false;

    }


    logger->info("Context created: n_ctx={}, n_batch={}, "

                 "flash_attn={}, type_k={}, type_v={}",

                 config().context_length, config().n_batch,

                 config().flash_attn,

                 config().cache_type_k, config().cache_type_v);


    // Initialize prompt cache if not already created

    if (!prompt_cache_) {

        prompt_cache_ = std::make_unique<PromptCache>(

            prompt_cache_config_.max_bytes);

        logger->info("Prompt cache initialized: max_bytes={}",

                     prompt_cache_config_.max_bytes);

    }

    return true;

}


void LlamaCppBackend::init_mmproj_if_configured() {

    if (config().mmproj_path.empty()) {

        has_vision_ = false;

        return;

    }

    auto ctx_params = mtmd_context_params_default();

    ctx_params.use_gpu = (config().gpu_layers != 0);

    ctx_params.flash_attn_type = config().flash_attn

        ? LLAMA_FLASH_ATTN_TYPE_ENABLED

        : LLAMA_FLASH_ATTN_TYPE_DISABLED;

    ctx_params.print_timings = false;

    mtmd_ctx_ = mtmd_init_from_file(

        config().mmproj_path.c_str(), model_, ctx_params);

    if (mtmd_ctx_ == nullptr) {

        logger->error("mtmd_init_from_file failed for {} — "

                      "continuing in text-only mode",

                      config().mmproj_path.string());

        has_vision_ = false;

        return;

    }

    has_vision_ = mtmd_support_vision(mtmd_ctx_);

    logger->info("mmproj loaded from {} — vision={}",

                 config().mmproj_path.string(), has_vision_);

}


void LlamaCppBackend::teardown_mtp_draft() {

    if (mtp_draft_ctx_ != nullptr) {

        llama_free(mtp_draft_ctx_);

        mtp_draft_ctx_ = nullptr;

    }

    if (mtp_draft_model_ != nullptr) {

        llama_model_free(mtp_draft_model_);

        mtp_draft_model_ = nullptr;

    }

    mtp_head_path_.clear();

}


bool LlamaCppBackend::setup_mtp_draft(const std::string& head_path, int n_max) {

    mtp_n_max_ = (n_max > 0) ? n_max : 16;

    if (mtp_draft_ctx_ != nullptr && mtp_head_path_ == head_path) {

        return true;  // live head already bound to this ctx_

    }

    teardown_mtp_draft();

    return build_mtp_head(head_path);

}


bool LlamaCppBackend::build_mtp_head(const std::string& head_path) {

    if (ctx_ == nullptr) {

        last_error_ = "MTP setup requires an ACTIVE target context";

        return false;

    }

    if (head_path.empty()) {

        // gh#108: fail loud before llama_model_load_from_file("") — a bare

        // mtp=true with no draft.path is a config error, not a load to attempt.

        last_error_ = "MTP requires speculative.draft.path (the head GGUF); "

                      "none configured";

        return false;

    }

    llama_model_params mparams = llama_model_default_params();

    mparams.n_gpu_layers = config().gpu_layers;  // head is tiny — follow target

    mparams.use_mmap = true;

    mtp_draft_model_ = llama_model_load_from_file(head_path.c_str(), mparams);

    if (mtp_draft_model_ != nullptr) {

        llama_context_params cparams = build_cparams(config());

        cparams.ctx_type = LLAMA_CONTEXT_TYPE_MTP;

        cparams.ctx_other = ctx_;   // share the target's KV memory

        cparams.n_rs_seq = 0;

        mtp_draft_ctx_ = llama_init_from_model(mtp_draft_model_, cparams);

    }

    bool ok = (mtp_draft_ctx_ != nullptr);

    if (ok) {

        mtp_head_path_ = head_path;

        logger->info("MTP head ready: {} (n_max={}, ctx_other=target, "

                     "shared-KV)", head_path, mtp_n_max_);

    } else {

        last_error_ = "MTP head setup failed: " + head_path;

        teardown_mtp_draft();

    }

    return ok;

}


void LlamaCppBackend::do_deactivate() {

    // gh#108 (v2.9.1): serialise vs an in-flight generate_mtp — it holds

    // mtp_mutex_ across its decode, so this blocks until that decode finishes

    // before freeing the MTP head + ctx_ (no deactivate-during-generate UAF).

    std::lock_guard<std::mutex> lk(mtp_mutex_);

    // gh#106 (v2.9.0): the MTP head borrows ctx_ via ctx_other — free it

    // FIRST so the borrow never dangles past the context.

    teardown_mtp_draft();

    // v2.3.10: sampler factory borrows ctx_ + vocab_. Release it

    // BEFORE freeing the context so the borrow never dangles.

    sampler_factory_.reset();

    // v2.1.8: mtmd holds a reference to the live llama_model — free

    // it before the GPU model is unloaded below.

    if (mtmd_ctx_ != nullptr) {

        mtmd_free(mtmd_ctx_);

        mtmd_ctx_ = nullptr;

        has_vision_ = false;

    }

    if (ctx_) {

        llama_free(ctx_);

        ctx_ = nullptr;

    }

    invalidate_resident_kv();  // gh#96: KV is gone with the context


    // Free the GPU model FIRST (releasing VRAM — the point of

    // deactivate), then reload CPU-only for the WARM state. tokenizer_

    // borrows the old vocab_, so reset it before the free.

    tokenizer_.reset();

    if (model_ != nullptr) {

        llama_model_free(model_);

        model_ = nullptr;

        vocab_ = nullptr;

    }

    reload_model_cpu_only();

}


void LlamaCppBackend::reload_model_cpu_only() {

    llama_model_params mparams = llama_model_default_params();

    mparams.n_gpu_layers = 0;

    mparams.use_mmap = true;

    mparams.use_mlock = config().use_mlock;


    model_ = llama_model_load_from_file(config().path.c_str(), mparams);

    if (model_ != nullptr) {

        vocab_ = llama_model_get_vocab(model_);

        tokenizer_ = std::make_unique<LlamaCppTokenizer>(vocab_);

    } else {

        // VRAM is released, but the warm-reload failed: leave the handle

        // null (state stays recoverable — the next activate reloads from

        // scratch). Error, not warn: a same-file CPU reload failing here

        // signals real trouble (disk/OOM).

        logger->error("Failed to reload CPU model during deactivate "

                      "(path={}); backend left unloaded until next activate",

                      config().path.string());

    }

}


LlamaCppBackend::~LlamaCppBackend() {

    do_unload();

}


void LlamaCppBackend::inject_tokenizer_for_test(

    std::unique_ptr<Tokenizer> tokenizer)

{

    tokenizer_ = std::move(tokenizer);

    state_.store(ModelState::WARM, std::memory_order_release);

}


void LlamaCppBackend::inject_sampler_factory_for_test(

    std::unique_ptr<SamplerFactory> factory)

{

    sampler_factory_ = std::move(factory);

}


void LlamaCppBackend::do_unload() {

    // gh#108 (v2.9.1): serialise vs in-flight generate_mtp (see do_deactivate).

    std::lock_guard<std::mutex> lk(mtp_mutex_);

    if (prompt_cache_) {

        prompt_cache_->clear();

    }

    // gh#106 (v2.9.0): MTP head borrows ctx_ — free it before the context.

    teardown_mtp_draft();

    // v2.3.10: sampler factory borrows ctx_ + vocab_ — release it

    // BEFORE the context/model are freed below so the borrow never

    // points into freed memory. (do_deactivate normally releases

    // this earlier; this reset is the WARM→COLD safety net.)

    sampler_factory_.reset();

    // v2.3.10: tokenizer borrows vocab_ — release it BEFORE the model

    // is freed so the borrow never points into freed memory.

    tokenizer_.reset();

    // v2.1.8: mtmd must be freed before the underlying llama_model.

    if (mtmd_ctx_ != nullptr) {

        mtmd_free(mtmd_ctx_);

        mtmd_ctx_ = nullptr;

        has_vision_ = false;

    }

    if (ctx_) {

        llama_free(ctx_);

        ctx_ = nullptr;

    }

    invalidate_resident_kv();  // gh#96: KV is gone with the context

    if (model_) {

        llama_model_free(model_);

        model_ = nullptr;

    }

    vocab_ = nullptr;

}


// ── Tokenization ───────────────────────────────────────────


std::vector<llama_token> LlamaCppBackend::tokenize(

    const std::string& text, bool add_special) const

{

    // v2.3.10: route through the Tokenizer seam. tokenizer_ is set

    // in do_load (real impl) or via inject_tokenizer_for_test (mock).

    // Returns empty when no tokenizer is wired — matches the prior

    // failure-path return shape.

    if (!tokenizer_) { return {}; }

    auto ids = tokenizer_->tokenize(text, add_special);

    // llama_token is int32_t; vector conversion is a copy through

    // iterators since the value type matches.

    return {ids.begin(), ids.end()};

}


std::string LlamaCppBackend::detokenize(llama_token token) const {

    // v2.3.10: route through Tokenizer seam. The special=false /

    // gh#68 history + defensive rationale now lives in

    // LlamaCppTokenizer::detokenize. Returns empty when no

    // tokenizer is wired — matches prior failure-path return.

    if (!tokenizer_) { return {}; }

    return tokenizer_->detokenize(static_cast<int32_t>(token));

}


int LlamaCppBackend::do_count_tokens(const std::string& text) const {

    auto tokens = tokenize(text, false);

    return static_cast<int>(tokens.size());

}


std::vector<int32_t> LlamaCppBackend::tokenize_text(

    const std::string& text) const {

    auto tokens = tokenize(text, true);

    return {tokens.begin(), tokens.end()};

}


// ── Evaluation (v1.9.10) ──────────────────────────────────


LogprobResult LlamaCppBackend::do_evaluate_logprobs(

    const int32_t* tokens,

    int n_tokens)

{

    int n_vocab = llama_vocab_n_tokens(vocab_);

    LogprobResult result;

    result.tokens.assign(tokens, tokens + n_tokens);

    result.n_tokens = n_tokens;

    result.n_logprobs = n_tokens - 1;

    result.logprobs.reserve(result.n_logprobs);


    auto* mem = llama_get_memory(ctx_);

    llama_memory_clear(mem, true);


    for (int i = 0; i < n_tokens; i++) {

        llama_token tok = tokens[i];

        llama_batch batch = llama_batch_get_one(&tok, 1);

        int rc = llama_decode(ctx_, batch);

        if (rc != 0) {

            llama_memory_clear(mem, true);

            throw std::runtime_error("llama_decode failed at logprob pos");

        }

        if (i < n_tokens - 1) {

            const float* logits = llama_get_logits_ith(ctx_, -1);

            float lp = extract_token_logprob(

                logits, tokens[i + 1], n_vocab);

            result.logprobs.push_back(lp);

        }

    }


    float sum = 0.0f;

    for (float lp : result.logprobs) { sum += lp; }

    result.total_logprob = sum;

    result.perplexity = std::exp(

        -sum / static_cast<float>(result.n_logprobs));


    llama_memory_clear(mem, true);

    return result;

}


llama_seq_id LlamaCppBackend::allocate_temp_seq_id() {

    std::lock_guard<std::mutex> lock(seq_id_mutex_);

    if (!free_seq_ids_.empty()) {

        auto id = free_seq_ids_.back();

        free_seq_ids_.pop_back();

        return id;

    }

    return next_temp_seq_id_++;

}


void LlamaCppBackend::release_temp_seq_id(llama_seq_id seq_id) {

    std::lock_guard<std::mutex> lock(seq_id_mutex_);

    free_seq_ids_.push_back(seq_id);

}


float LlamaCppBackend::extract_token_logprob(

    const float* logits,

    int32_t next_token,

    int n_vocab)

{

    float max_logit = logits[0];

    for (int v = 1; v < n_vocab; v++) {

        if (logits[v] > max_logit) {

            max_logit = logits[v];

        }

    }

    float sum_exp = 0.0f;

    for (int v = 0; v < n_vocab; v++) {

        sum_exp += std::exp(logits[v] - max_logit);

    }

    float log_sum_exp = max_logit + std::log(sum_exp);

    return logits[next_token] - log_sum_exp;

}


// ── Chat template ──────────────────────────────────────────


static std::vector<llama_chat_message> to_llama_chat(

    const std::vector<Message>& messages) {

    std::vector<llama_chat_message> chat_msgs;

    chat_msgs.reserve(messages.size());

    for (const auto& msg : messages) {

        chat_msgs.push_back({msg.role.c_str(), msg.content.c_str()});

    }

    return chat_msgs;

}


static std::vector<common_chat_msg> to_common_chat(

    const std::vector<Message>& messages) {

    std::vector<common_chat_msg> out;

    out.reserve(messages.size());

    for (const auto& msg : messages) {

        common_chat_msg cm;

        cm.role = msg.role;

        cm.content = msg.content;

        out.push_back(std::move(cm));

    }

    return out;

}


static std::vector<common_chat_tool> mcp_tools_to_common_chat(

    const std::string& tools_json) {

    std::vector<common_chat_tool> out;

    if (tools_json.empty()) { return out; }

    auto arr = nlohmann::json::parse(tools_json, nullptr, false);

    if (!arr.is_array()) { return out; }

    for (const auto& t : arr) {

        common_chat_tool ct;

        ct.name = t.value("name", "");

        ct.description = t.value("description", "");

        if (t.contains("inputSchema")) {

            ct.parameters = t["inputSchema"].dump();

        }

        if (!ct.name.empty()) { out.push_back(std::move(ct)); }

    }

    return out;

}


static ToolCall to_entropic_tool_call(const common_chat_tool_call& cc) {

    ToolCall tc;

    tc.id = cc.id;

    tc.name = cc.name;

    tc.arguments_json = cc.arguments;

    auto j = nlohmann::json::parse(cc.arguments, nullptr, false);

    if (j.is_object()) {

        for (auto it = j.begin(); it != j.end(); ++it) {

            tc.arguments[it.key()] =

                it->is_string() ? it->get<std::string>() : it->dump();

        }

    }

    return tc;

}


static std::optional<common_chat_params> render_common_chat(

    llama_model* model,

    const std::vector<Message>& messages,

    const GenerationParams& params,

    const std::vector<common_chat_tool>& tools) {

    if (model == nullptr) { return std::nullopt; }

    auto tmpls = common_chat_templates_init(model, "");

    std::optional<common_chat_params> out;

    if (tmpls) {

        common_chat_templates_inputs inputs;

        inputs.messages = to_common_chat(messages);

        inputs.add_generation_prompt = true;

        inputs.use_jinja = true;

        inputs.enable_thinking = params.enable_thinking;  // gh#86

        inputs.tools = tools;

        if (!tools.empty()) {

            inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;

        }

        try {

            out = common_chat_templates_apply(tmpls.get(), inputs);

        } catch (const std::exception& e) {

            logger->warn("jinja chat template apply failed ({}); "

                         "falling back to low-level template", e.what());

        }

    }

    return out;

}


static std::string concat_messages_fallback(

    const std::vector<Message>& messages) {

    std::string fallback;

    for (const auto& msg : messages) {

        fallback += msg.role + ": " + msg.content + "\n";

    }

    return fallback;

}


std::string LlamaCppBackend::apply_chat_template(

    const std::vector<Message>& messages,

    const GenerationParams& params) const

{

    auto rendered = render_common_chat(model_, messages, params, {});

    return rendered ? rendered->prompt

                    : apply_chat_template_lowlevel(messages);

}


std::string LlamaCppBackend::render_prompt(

    const std::vector<Message>& messages,

    const GenerationParams& params)

{

    if (!active_tools_json_.empty()) {

        return render_with_tools(messages, params);

    }

    have_chat_params_ = false;

    return apply_chat_template(messages, params);

}


void LlamaCppBackend::set_active_tools(const std::string& tools_json) {

    active_tools_json_ = tools_json;

    logger->info("Active tools staged for common_chat render: {} bytes",

                 tools_json.size());

}


std::string LlamaCppBackend::render_with_tools(

    const std::vector<Message>& messages,

    const GenerationParams& params)

{

    have_chat_params_ = false;

    auto tools = mcp_tools_to_common_chat(active_tools_json_);

    auto rendered = render_common_chat(model_, messages, params, tools);

    std::string prompt;

    if (rendered) {

        last_chat_format_ = static_cast<int>(rendered->format);

        last_generation_prompt_ = rendered->generation_prompt;

        last_parser_ = rendered->parser;

        have_chat_params_ = true;

        // gh#105: snapshot this TOOLED render for the engine's later re-parse.

        // A toolless interleave (validator critique) clears have_chat_params_

        // but NOT this snapshot, so parse_response still decodes the main call.

        parse_chat_format_ = last_chat_format_;

        parse_generation_prompt_ = last_generation_prompt_;

        parse_parser_ = last_parser_;

        parse_params_valid_ = true;

        prompt = rendered->prompt;

        logger->info("render_with_tools: format={}, {} tool(s), captured "

                     "parser ({} bytes)", last_chat_format_, tools.size(),

                     last_parser_.size());

    } else {

        prompt = apply_chat_template_lowlevel(messages);

    }

    return prompt;

}


bool LlamaCppBackend::common_chat_parse_reliable() const {

    // gh#105: read the sticky last-TOOLED snapshot, not the live capture — the

    // engine queries this AFTER a toolless validator render would have cleared

    // have_chat_params_, so the live flag is unreliable here.

    return parse_params_valid_

        && parse_chat_format_ == COMMON_CHAT_FORMAT_PEG_GEMMA4;

}


std::string LlamaCppBackend::tool_call_close_marker() const {

    // last_chat_format_ is stored as int (the captured common_chat_format).

    return have_chat_params_

        ? close_marker_for_format(

              static_cast<common_chat_format>(last_chat_format_))

        : "";

}


std::vector<std::string> LlamaCppBackend::effective_stop(

    const GenerationParams& params) const {

    GenerationParams p = params;

    const std::size_t before = p.stop.size();

    append_sequential_stop(p, tool_call_close_marker());

    if (p.stop.size() > before) {

        logger->info("Sequential tier: tool-call close marker injected "

                     "post-render (gh#105) — hard-stop at first tool call");

    }

    return p.stop;

}


void strip_thinking_channels(std::string& content, std::string* reasoning_out) {

    static const std::string kOpen = "<|channel>";

    static const std::string kClose = "<channel|>";

    bool stripped = false;

    bool truncated_unclosed = false;

    std::size_t pos;

    while ((pos = content.find(kOpen)) != std::string::npos) {

        stripped = true;

        std::size_t end = content.find(kClose, pos + kOpen.size());

        if (end == std::string::npos) { truncated_unclosed = true; }

        std::size_t span_end =

            (end == std::string::npos) ? content.size() : end + kClose.size();

        if (reasoning_out != nullptr) {

            std::size_t inner = pos + kOpen.size();

            std::size_t inner_end =

                (end == std::string::npos) ? content.size() : end;

            reasoning_out->append(content, inner, inner_end - inner);

        }

        content.erase(pos, span_end - pos);

    }

    if (stripped) {

        std::size_t nb = content.find_first_not_of(" \t\r\n");

        content.erase(0, nb == std::string::npos ? content.size() : nb);

    }

    if (truncated_unclosed && content.empty()) {

        logger->warn("strip_thinking_channels: generation hit max_tokens "

                     "while still inside a <|channel> reasoning block — no "

                     "answer was ever produced, so content is empty (not a "

                     "parse error). Raise max_tokens or investigate why this "

                     "config/prompt doesn't converge within budget.");

    }

}


LlamaCppBackend::CommonChatResult LlamaCppBackend::parse_response(

    const std::string& raw) const

{

    CommonChatResult result;

    // gh#105: decode from the sticky last-TOOLED snapshot (parse_*), NOT the

    // live capture — a toolless validator render between the main generation

    // and this re-parse would have cleared the live params.

    if (!parse_params_valid_) {

        result.content = raw;

        return result;

    }

    common_chat_parser_params pp;

    pp.format = static_cast<common_chat_format>(parse_chat_format_);

    pp.generation_prompt = parse_generation_prompt_;

    pp.parser.load(parse_parser_);  // mandatory — see header

    try {

        auto msg = common_chat_parse(raw, /*is_partial=*/false, pp);

        result.content = msg.content;

        result.reasoning_content = msg.reasoning_content;

        // gh#106: Gemma 4 QAT reasoning channels common_chat doesn't parse.

        strip_thinking_channels(result.content, &result.reasoning_content);

        for (const auto& tc : msg.tool_calls) {

            result.tool_calls.push_back(to_entropic_tool_call(tc));

        }

        // gh#90: gemma <|"|> string-escape loses type through PEG_GEMMA4 —

        // restore string typing for params the staged schema declares string.

        coerce_string_typed_args(result.tool_calls, active_tools_json_);

    } catch (const std::exception& e) {

        logger->warn("common_chat_parse failed ({}); raw kept as content",

                     e.what());

        result.content = raw;

    }

    return result;

}


std::string LlamaCppBackend::apply_chat_template_lowlevel(

    const std::vector<Message>& messages) const

{

    auto chat_msgs = to_llama_chat(messages);


    int n = llama_chat_apply_template(

        nullptr, chat_msgs.data(), chat_msgs.size(),

        true, nullptr, 0);

    if (n < 0) {

        logger->error("llama_chat_apply_template failed (size query)");

        return concat_messages_fallback(messages);

    }


    std::vector<char> buf(static_cast<size_t>(n + 1));

    int written = llama_chat_apply_template(

        nullptr, chat_msgs.data(), chat_msgs.size(),

        true, buf.data(), static_cast<int32_t>(buf.size()));

    if (written < 0) {

        logger->error("llama_chat_apply_template failed (render)");

        return concat_messages_fallback(messages);

    }


    return std::string(buf.data(), static_cast<size_t>(written));

}


// ── Sampler ────────────────────────────────────────────────


// Builds a sampler for `params` through the installed sampler factory.
//
// @return The factory's sampler, or nullptr when no factory is installed.
std::unique_ptr<Sampler> LlamaCppBackend::create_sampler(
    const GenerationParams& params) const
{
    if (sampler_factory_) {
        return sampler_factory_->create(params);
    }
    return nullptr;
}


// ── Decode loop ────────────────────────────────────────────


bool LlamaCppBackend::run_prefill(const std::vector<llama_token>& tokens) {

    llama_memory_clear(llama_get_memory(ctx_), true);


    const int n_batch = config().n_batch;

    const int n_tokens = static_cast<int>(tokens.size());


    for (int i = 0; i < n_tokens; i += n_batch) {

        int chunk = std::min(n_batch, n_tokens - i);

        std::vector<llama_token> slice(

            tokens.begin() + i, tokens.begin() + i + chunk);

        llama_batch batch = llama_batch_get_one(

            slice.data(), static_cast<int32_t>(chunk));

        if (llama_decode(ctx_, batch) != 0) {

            logger->error("Prefill decode failed at offset {}", i);

            return false;

        }

    }

    last_prefill_tokens_ += n_tokens;  // gh#96: count tokens decoded in prefill

    return true;

}


// Runs one generation step: sample, append text, stream, check stops, decode.
//
// @param sampler   Source of the next token.
// @param generated Accumulated output text; the new piece is appended to it.
// @param on_token  Optional streaming callback; invoked with the new piece
//                  before stop sequences are checked.
// @param stop      Stop sequences matched against the tail of `generated`.
// @return "eos" for an EOS/end-of-generation token (nothing appended),
//         "stop" when `generated` now ends with a stop sequence,
//         "continue" when the token was decoded, "error" when decode failed.
std::string LlamaCppBackend::step_token(
    Sampler& sampler,
    std::string& generated,
    std::function<void(std::string_view)>& on_token,
    const std::vector<std::string>& stop)
{
    llama_token sampled = sampler.sample();

    const bool end_of_generation =
        sampled == llama_vocab_eos(vocab_)
        || llama_vocab_is_eog(vocab_, sampled);
    if (end_of_generation) {
        return "eos";
    }

    std::string text = detokenize(sampled);
    generated += text;
    if (on_token) {
        on_token(std::string_view(text));
    }
    if (check_stop_sequences(generated, stop)) {
        return "stop";
    }

    // Feed the accepted token back so the next sample sees fresh logits.
    llama_batch one = llama_batch_get_one(&sampled, 1);
    const bool decoded = (llama_decode(ctx_, one) == 0);
    return decoded ? "continue" : "error";
}


// Full single-sequence generation: build a sampler, cold-prefill `tokens`,
// then hand over to generate_after_prefill().
//
// @param tokens   Prompt tokens to prefill.
// @param params   Sampling and length parameters.
// @param on_token Optional streaming callback, forwarded to the generator.
// @param cancel   Optional cancellation flag, forwarded to the generator.
// @return The generation result, or an error result (finish_reason "error",
//         ENTROPIC_ERROR_GENERATE_FAILED) when the sampler or prefill fails.
GenerationResult LlamaCppBackend::decode_loop(
    const std::vector<llama_token>& tokens,
    const GenerationParams& params,
    std::function<void(std::string_view)> on_token,
    std::atomic<bool>* cancel)
{
    // v2.3.10: Sampler seam — factory installed in do_activate.
    auto sampler = create_sampler(params);
    if (sampler == nullptr) {
        GenerationResult failure;
        failure.finish_reason = "error";
        failure.error_message = "Sampler factory not initialized";
        failure.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
        return failure;
    }

    const bool prefilled = run_prefill(tokens);
    if (!prefilled) {
        // Same code / message / finish_reason as the file-level helper.
        return prefill_error();
    }

    return generate_after_prefill(*sampler, params, std::move(on_token), cancel);
}


// Token loop for a context that has already been prefilled.
//
// Steps until max_tokens "continue" steps have completed, the cancel flag is
// set, or step_token reports a terminal status. finish_reason ends up as
// "cancelled", "error", "stop" (EOS or stop sequence) or "length".
//
// @param sampler  Sampler used for every step.
// @param params   Supplies max_tokens and the effective stop sequences.
// @param on_token Optional streaming callback passed to each step.
// @param cancel   Optional flag polled before each step; may be nullptr.
// @return Result carrying the generated text and the count of decoded tokens.
GenerationResult LlamaCppBackend::generate_after_prefill(
    Sampler& sampler,
    const GenerationParams& params,
    std::function<void(std::string_view)> on_token,
    std::atomic<bool>* cancel)
{
    GenerationResult out;
    std::string text;
    int produced = 0;
    const auto stop_seqs = effective_stop(params);  // gh#105: per-call sequential marker

    while (produced < params.max_tokens) {
        if (cancel != nullptr && cancel->load(std::memory_order_acquire)) {
            out.finish_reason = "cancelled";
            out.error_code = ENTROPIC_ERROR_CANCELLED;
            break;
        }

        const auto outcome = step_token(sampler, text, on_token, stop_seqs);
        if (outcome == "continue") {
            ++produced;
            continue;
        }

        // Terminal status: "error" is a failure, anything else is a stop.
        if (outcome == "error") {
            out.finish_reason = "error";
            out.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
        } else {
            out.finish_reason = "stop";
        }
        break;
    }

    // Budget exhausted without any other terminal reason.
    if (out.finish_reason.empty() && produced >= params.max_tokens) {
        out.finish_reason = "length";
    }

    out.content = text;
    out.token_count = produced;
    return out;
}


// ── gh#98: same-prefix multi-seq batched generation ────────


// Builds the error result used for every request of a failed batch:
// ENTROPIC_ERROR_GENERATE_FAILED, finish_reason "error", message `msg`.
static GenerationResult batch_error_result(const std::string& msg) {
    GenerationResult failure;
    failure.finish_reason = "error";
    failure.error_message = msg;
    failure.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
    return failure;
}


// Writes slot `k` of batch `b`: one token at position `pos`, belonging to the
// single sequence `seq`, with the logits flag set to 1 or 0 per `want_logits`.
static void fill_batch_cell(llama_batch& b, int k, llama_token tok,
                            llama_pos pos, llama_seq_id seq, bool want_logits) {
    // Sequence membership: exactly one sequence id per cell.
    b.n_seq_id[k] = 1;
    b.seq_id[k][0] = seq;

    b.token[k] = tok;
    b.pos[k] = pos;

    if (want_logits) {
        b.logits[k] = 1;
    } else {
        b.logits[k] = 0;
    }
}


bool LlamaCppBackend::prepare_batch_seqs(

    std::vector<BatchSeq>& seqs,

    const std::vector<GenerationParams>& params) {

    for (std::size_t i = 0; i < seqs.size(); ++i) {

        seqs[i].sampler = create_sampler(params[i]);

        auto* ls = dynamic_cast<LlamaCppSampler*>(seqs[i].sampler.get());

        if (ls == nullptr) { return false; }

        seqs[i].chain = ls->native_chain();

        seqs[i].seq_id = (i == 0) ? 0 : allocate_temp_seq_id();

        seqs[i].max_tokens = params[i].max_tokens;

    }

    return true;

}


// Decodes the first `shared` tokens of `seq0` once, then copies that KV range
// [0, shared) from sequence 0 to every other sequence in `seqs`.
//
// On success every sequence's `pos` is set to `shared`.
//
// @param seqs   Batch sequences; entry 0 is the copy source.
// @param seq0   Token stream whose leading `shared` tokens form the prefix.
// @param shared Length of the common prefix in tokens.
// @return false when decoding the prefix failed, true otherwise.
bool LlamaCppBackend::prefill_shared_and_fanout(
    std::vector<BatchSeq>& seqs, const std::vector<llama_token>& seq0,
    std::size_t shared) {
    const auto prefix_end = seq0.begin() + static_cast<long>(shared);
    std::vector<llama_token> common(seq0.begin(), prefix_end);
    if (!decode_tokens_from(common, 0)) { return false; }  // into seq 0

    auto* memory = llama_get_memory(ctx_);
    const auto end_pos = static_cast<llama_pos>(shared);
    for (std::size_t idx = 1; idx < seqs.size(); ++idx) {
        llama_memory_seq_cp(memory, 0, seqs[idx].seq_id, 0, end_pos);
    }

    const int next_pos = static_cast<int>(shared);
    for (auto& seq : seqs) {
        seq.pos = next_pos;
    }
    return true;
}


// Decodes every request's post-prefix tokens in a single llama_batch.
//
// Each sequence contributes tokens [shared, len) at their absolute positions.
// Only the last token of each sequence requests logits, and its batch index
// is stored in `logits_idx` for the first sampling step. Each sequence's
// `pos` is set to its full prompt length.
//
// @param seqs   Per-request state (seq ids in, logits_idx / pos out).
// @param toks   Full token stream per request, indexed like `seqs`.
// @param shared Length of the already-decoded common prefix.
// @return true when the batch decoded successfully.
bool LlamaCppBackend::prefill_batch_suffixes(
    std::vector<BatchSeq>& seqs,
    const std::vector<std::vector<llama_token>>& toks,
    std::size_t shared) {
    // shared <= shortest-1 < every t.size() by batch_shared_prefix_len, but
    // clamp the unsigned subtraction anyway so a bad `shared` cannot wrap
    // into a huge allocation.
    int capacity = 0;
    for (const auto& stream : toks) {
        capacity += static_cast<int>(
            stream.size() - std::min(shared, stream.size()));
    }

    llama_batch batch = llama_batch_init(
        capacity, 0, static_cast<int32_t>(seqs.size()));

    const int first = static_cast<int>(shared);
    int cursor = 0;
    for (std::size_t idx = 0; idx < seqs.size(); ++idx) {
        const auto& stream = toks[idx];
        const int len = static_cast<int>(stream.size());
        for (int p = first; p < len; ++p, ++cursor) {
            const bool is_last = (p == len - 1);
            fill_batch_cell(batch, cursor, stream[p], p, seqs[idx].seq_id,
                            is_last);
            if (is_last) { seqs[idx].logits_idx = cursor; }
        }
        seqs[idx].pos = len;
    }

    batch.n_tokens = cursor;
    last_prefill_tokens_ += cursor;

    const int rc = llama_decode(ctx_, batch);
    llama_batch_free(batch);
    return rc == 0;
}


void LlamaCppBackend::sample_batch_active(std::vector<BatchSeq>& seqs) {

    for (auto& s : seqs) {

        if (!s.active) { continue; }

        // llama_sampler_sample() accepts the drawn token into the chain

        // internally (advancing grammar/penalties) — matching the single-seq

        // step_token path. A second accept would double-advance the grammar.

        llama_token tok = llama_sampler_sample(s.chain, ctx_, s.logits_idx);

        if (llama_vocab_is_eog(vocab_, tok)) {

            s.active = false;

            s.finish = "stop";

            continue;

        }

        s.out.push_back(tok);

        ++s.n_gen;

        if (s.n_gen >= s.max_tokens) { s.active = false; s.finish = "length"; }

    }

}


void LlamaCppBackend::run_batch_gen_loop(

    std::vector<BatchSeq>& seqs, int max_steps, std::atomic<bool>& cancel) {

    llama_batch batch = llama_batch_init(static_cast<int32_t>(seqs.size()), 0,

                                         static_cast<int32_t>(seqs.size()));

    for (int step = 0; step < max_steps; ++step) {

        if (cancel.load(std::memory_order_acquire)) { break; }

        sample_batch_active(seqs);

        int k = 0;

        for (auto& s : seqs) {

            if (!s.active) { continue; }

            fill_batch_cell(batch, k, s.out.back(), s.pos, s.seq_id, true);

            s.logits_idx = k;

            ++s.pos;

            ++k;

        }

        if (k == 0) { break; }

        batch.n_tokens = k;

        ++last_gen_decode_calls_;

        if (llama_decode(ctx_, batch) != 0) { break; }

    }

    llama_batch_free(batch);

}


std::vector<GenerationResult> LlamaCppBackend::build_batch_results(

    std::vector<BatchSeq>& seqs) {

    std::vector<GenerationResult> out;

    out.reserve(seqs.size());

    for (auto& s : seqs) {

        GenerationResult r;

        for (llama_token t : s.out) { r.content += detokenize(t); }

        r.token_count = s.n_gen;

        r.finish_reason = s.finish;

        out.push_back(std::move(r));

    }

    return out;

}


// Hands back the temporary sequence ids held by entries 1..N-1 of `seqs`.
// Entry 0 is skipped, and so is any entry whose id is still 0.
void LlamaCppBackend::release_temp_seqs(std::vector<BatchSeq>& seqs) {
    std::size_t idx = 1;
    while (idx < seqs.size()) {
        const auto id = seqs[idx].seq_id;
        if (id != 0) {
            release_temp_seq_id(id);
        }
        ++idx;
    }
}


// gh#98: Runs one same-prefix batched generation over already-tokenized
// prompts and returns one result per prompt, in input order.
//
// Order of work: prepare per-sequence samplers and seq ids, clear the context
// memory, decode the `shared` prefix once and copy it to the other sequences,
// decode the per-sequence suffixes in one batch, then run the lock-step
// generation loop. A sampler-setup or prefill failure yields `n` identical
// error results.
//
// @param toks   Token stream per request; toks[0] supplies the shared prefix.
// @param params Generation parameters per request, indexed like `toks`.
// @param shared Number of leading tokens treated as common to all requests.
// @param cancel Cooperative cancellation flag passed to the generation loop.
// @return One GenerationResult per request.
std::vector<GenerationResult> LlamaCppBackend::run_batched_decode(

    const std::vector<std::vector<llama_token>>& toks,

    const std::vector<GenerationParams>& params,

    std::size_t shared,

    std::atomic<bool>& cancel)

{

    const std::size_t n = toks.size();

    std::vector<BatchSeq> seqs(n);

    if (!prepare_batch_seqs(seqs, params)) {

        release_temp_seqs(seqs);  // don't leak ids allocated before the failure

        return std::vector<GenerationResult>(

            n, batch_error_result("batch sampler init"));

    }

    // The loop bound is the largest per-request token budget.
    int max_steps = 0;

    for (const auto& p : params) { max_steps = std::max(max_steps, p.max_tokens); }


    // Start from empty memory, drop the warm-keep token record (the memory is
    // about to be overwritten), and reset the per-call counters logged below.
    llama_memory_clear(llama_get_memory(ctx_), true);

    invalidate_resident_kv();

    last_prefill_tokens_ = 0;

    last_gen_decode_calls_ = 0;


    // Short-circuit: suffixes are only decoded if the shared prefix succeeded.
    bool ok = prefill_shared_and_fanout(seqs, toks[0], shared)

           && prefill_batch_suffixes(seqs, toks, shared);

    if (ok) { run_batch_gen_loop(seqs, max_steps, cancel); }


    auto out = ok ? build_batch_results(seqs)

                  : std::vector<GenerationResult>(

                        n, batch_error_result("batch prefill"));

    // Cleanup runs on both the success and the prefill-failure path.
    release_temp_seqs(seqs);

    invalidate_resident_kv();

    logger->info("gh#98 batch: requests={} prefix.tokens_shared={} "

                 "prefix.tokens_saved={} total_prefill_tokens={} gen_decodes={}",

                 n, shared, shared * (n - 1), last_prefill_tokens_,

                 last_gen_decode_calls_);

    return out;

}


std::vector<GenerationResult> LlamaCppBackend::do_generate_batch(

    const std::vector<std::vector<Message>>& requests,

    const std::vector<GenerationParams>& params,

    std::atomic<bool>& cancel)

{

    const std::size_t n = requests.size();

    std::vector<std::vector<llama_token>> toks(n);

    for (std::size_t i = 0; i < n; ++i) {

        toks[i] = tokenize(render_prompt(requests[i], params[i]), true);

    }

    const std::size_t shared = batch_shared_prefix_len(toks);

    std::size_t total_suffix = 0;

    for (const auto& t : toks) { total_suffix += t.size() - shared; }


    const bool hybrid = is_hybrid_ || is_recurrent_;

    if (!batch_is_viable(n, config().n_parallel, shared, hybrid,

                         total_suffix, config().n_batch)) {

        return InferenceBackend::do_generate_batch(requests, params, cancel);

    }

    return run_batched_decode(toks, params, shared, cancel);

}


// ── Prompt cache helpers ───────────────────────────────────


std::string LlamaCppBackend::extract_system_prompt(

    const std::vector<Message>& messages)

{

    for (const auto& msg : messages) {

        if (msg.role == "system") {

            return msg.content;

        }

    }

    return "";

}


bool LlamaCppBackend::decode_tokens_from(

    const std::vector<llama_token>& tokens, int start_offset)

{

    int total = static_cast<int>(tokens.size());

    if (start_offset >= total) { return true; }


    int n_batch = llama_n_batch(ctx_);

    int n_remaining = total - start_offset;

    last_prefill_tokens_ += n_remaining;  // gh#96: count tokens decoded here

    for (int off = 0; off < n_remaining; off += n_batch) {

        int chunk = std::min(n_batch, n_remaining - off);

        llama_batch batch = llama_batch_get_one(

            const_cast<llama_token*>(tokens.data())

                + start_offset + off,

            chunk);

        if (llama_decode(ctx_, batch) != 0) {

            logger->error("Decode chunk failed (start={}, off={}, "

                          "chunk={})", start_offset, off, chunk);

            return false;

        }

    }

    return true;

}


bool LlamaCppBackend::restore_cached_prefix(

    const CacheEntry* cached,

    const std::vector<llama_token>& tokens)

{

    auto* mem = llama_get_memory(ctx_);

    llama_memory_clear(mem, true);


    size_t restored = llama_state_seq_set_data(

        ctx_, cached->data.data(), cached->data_size, 0);

    if (restored == 0) {

        logger->warn("KV state restore failed, falling back to full prefill");

        return false;

    }


    return decode_tokens_from(tokens, cached->token_count);

}


// Serializes the current state of sequence 0 and stores it in the prompt
// cache under `key`, tagged with `prefix_tokens`.
//
// Nothing is stored when the reported state size is zero or when no bytes
// were written.
void LlamaCppBackend::save_prefix_to_cache(
    const CacheKey& key, int prefix_tokens)
{
    const size_t needed = llama_state_seq_get_size(ctx_, 0);
    if (needed != 0) {
        std::vector<uint8_t> blob(needed);
        const size_t copied = llama_state_seq_get_data(
            ctx_, blob.data(), blob.size(), 0);
        if (copied > 0) {
            blob.resize(copied);  // keep only the bytes actually written
            prompt_cache_->store(key, std::move(blob), prefix_tokens);
        }
    }
}


int LlamaCppBackend::compute_prefix_token_count(

    const std::vector<Message>& messages,

    const GenerationParams& params)

{

    std::vector<Message> sys_msgs;

    for (const auto& msg : messages) {

        if (msg.role == "system") {

            sys_msgs.push_back(msg);

        }

    }

    if (sys_msgs.empty()) {

        return 0;

    }


    std::string sys_prompt = apply_chat_template(sys_msgs, params);

    auto sys_tokens = tokenize(sys_prompt, true);

    return static_cast<int>(sys_tokens.size());

}


// Prefills the prompt in two passes so the state after the first
// `prefix_tokens` tokens can be stored in the prompt cache under `key`.
//
// When the prefix is empty or spans the whole prompt there is nothing to
// split, and the prompt is prefilled in one pass with nothing cached.
//
// @return true when the whole prompt was decoded.
bool LlamaCppBackend::prefill_and_cache_prefix(
    const std::vector<llama_token>& tokens,
    int prefix_tokens,
    const CacheKey& key)
{
    const int total = static_cast<int>(tokens.size());
    const bool splittable = (prefix_tokens > 0) && (prefix_tokens < total);
    if (!splittable) {
        return run_prefill(tokens);
    }

    // Pass 1: only the prefix. run_prefill clears the memory first, so
    // sequence 0 ends up holding exactly prefix_tokens positions.
    const std::vector<llama_token> head(
        tokens.begin(), tokens.begin() + prefix_tokens);
    if (!run_prefill(head)) {
        return false;
    }

    // Snapshot while the state holds exactly the prefix.
    save_prefix_to_cache(key, prefix_tokens);

    // Pass 2: the remainder, appended after the saved prefix (no clear).
    return decode_tokens_from(tokens, prefix_tokens);
}


// Prefill entry point shared by the text generation paths.
//
// Resets the per-turn prefill counters, picks a prefill strategy (plain full
// prefill for hybrid/recurrent models; warm-keep reuse with a cold
// prompt-cache fallback otherwise), then records and logs the elapsed time.
//
// @param tokens        Full prompt tokens for this turn.
// @param system_prompt System prompt text, forwarded to prefill_dispatch.
// @param messages      Original messages, forwarded to prefill_dispatch.
// @param params        Generation parameters, forwarded to prefill_dispatch.
// @return true when the prompt is fully decoded and sampling can start.
bool LlamaCppBackend::run_prefill_cached(

    const std::vector<llama_token>& tokens,

    const std::string& system_prompt,

    const std::vector<Message>& messages,

    const GenerationParams& params)

{

    // gh#96 (v2.7.5): count tokens actually pushed through llama_decode during

    // prefill this turn. run_prefill / decode_tokens_from accumulate into

    // last_prefill_tokens_; a prompt-cache HIT restores the system prefix

    // without a decode, so this counts the re-decoded post-system remainder —

    // the per-turn waste that climbs today and should collapse to the appended

    // delta once warm-keep reuse lands. (llama_perf n_p_eval proved unreliable

    // across the state-restore boundary, so we count the decodes directly.)

    last_prefill_tokens_ = 0;

    last_input_tokens_ = static_cast<int>(tokens.size());  // gh#97

    auto t_pre = entropic::log::now();

    bool ok;

    if (is_hybrid_ || is_recurrent_) {

        // gh#97 (v2.7.6): hybrid/recurrent (SSM) memory rejects the partial

        // seq_rm warm-keep needs (state can't be partially erased at the tail),

        // and the prompt-cache restore lands non-contiguous cells — both desync

        // KV positions (pos_max inflates → eventual decode slot-failure with the

        // cache mostly empty). Plain full prefill (clear + contiguous decode) is

        // the only correct path for these archs at this llama.cpp pin. Mirrors

        // the speculative-decoding guard. Forfeits the gh#96 reuse for them.

        ok = run_prefill(tokens);

        // No warm-keep for these models: never leave a resident token record.
        invalidate_resident_kv();

    } else {

        // gh#96 warm-keep: reuse the resident KV prefix + decode only the delta;

        // fall back to a cold prefill (clear + system-prefix cache) when reuse

        // is off, the prefix diverged, or the KV was mutated out-of-band.

        ok = try_warm_reuse(tokens);

        if (!ok) {

            ok = prefill_dispatch(tokens, system_prompt, messages, params);

            // Record what now sits in the KV on success; on failure the KV
            // content is unknown, so the record is dropped.
            if (ok) {

                resident_tokens_ = tokens;

            } else {

                invalidate_resident_kv();

            }

        }

    }

    last_prefill_ms_ = entropic::log::elapsed_ms(t_pre, entropic::log::now());

    logger->info("Prefill (gh#96): {} tokens / {:.1f} ms decoded this turn",

                 last_prefill_tokens_, last_prefill_ms_);

    return ok;

}


// gh#96 warm-keep: reuse the KV prefix already resident in sequence 0 and
// decode only the tokens that follow it.
//
// warm_keep_cut() decides how many leading positions can be kept. Everything
// from `cut` onward is removed from sequence 0 and tokens[cut..] is decoded
// on top. The removal result is checked: llama_memory_seq_rm returns false
// when a partial removal is rejected, and decoding the delta on top of a tail
// that is still resident would corrupt the context. In that case the resident
// record is invalidated and false is returned so the caller runs a cold
// prefill.
//
// @param tokens Full prompt tokens for this turn.
// @return true when the prompt is now fully resident; false when reuse is
//         disabled, nothing is reusable, or trimming/decoding failed.
bool LlamaCppBackend::try_warm_reuse(const std::vector<llama_token>& tokens) {
    if (!prompt_cache_config_.warm_keep || ctx_ == nullptr) {
        return false;
    }
    auto* mem = llama_get_memory(ctx_);
    long pos_max = static_cast<long>(llama_memory_seq_pos_max(mem, 0));
    std::size_t cut = warm_keep_cut(resident_tokens_, tokens, pos_max);
    if (cut == 0) {
        return false;  // nothing reusable — cold prefill
    }
    // Drop the divergent tail (and any prior generated tokens past `cut`),
    // then decode only the appended delta. A single exit (returns <= 3 gate):
    // success records the new resident set; failure invalidates and reports it.
    const bool trimmed =
        llama_memory_seq_rm(mem, 0, static_cast<llama_pos>(cut), -1);
    if (!trimmed) {
        logger->warn("Warm-keep: KV tail removal from pos {} was rejected, "
                     "falling back to cold prefill", cut);
    }
    const bool ok = trimmed && decode_tokens_from(tokens, static_cast<int>(cut));
    if (ok) {
        resident_tokens_ = tokens;
        if (prompt_cache_config_.log_hits) {
            logger->info("Warm-keep: reused {} resident tokens, decoded {} "
                         "delta (of {} total)", cut, tokens.size() - cut,
                         tokens.size());
        }
    } else {
        invalidate_resident_kv();
    }
    return ok;
}


// Forgets which tokens are resident in the KV memory by emptying
// resident_tokens_, the record try_warm_reuse passes to warm_keep_cut.
// Only the bookkeeping is cleared; the llama memory itself is not touched.
void LlamaCppBackend::invalidate_resident_kv() {

    resident_tokens_.clear();

}


bool LlamaCppBackend::prefill_dispatch(

    const std::vector<llama_token>& tokens,

    const std::string& system_prompt,

    const std::vector<Message>& messages,

    const GenerationParams& params)

{

    bool cache_enabled = prompt_cache_

        && prompt_cache_config_.enabled

        && !system_prompt.empty();


    if (!cache_enabled) {

        return run_prefill(tokens);

    }


    CacheKey key = PromptCache::make_key(

        system_prompt, config().path.string());

    const CacheEntry* cached = prompt_cache_->lookup(key);


    if (cached != nullptr) {

        if (prompt_cache_config_.log_hits) {

            logger->info("Prompt cache HIT: {} bytes, {} prefix tokens",

                         cached->data_size, cached->token_count);

        }

        if (restore_cached_prefix(cached, tokens)) {

            return true;

        }

        logger->warn("Cache restore failed, falling back to full prefill");

    } else if (prompt_cache_config_.log_hits) {

        logger->info("Prompt cache MISS: processing full prompt");

    }


    int prefix_tokens = compute_prefix_token_count(messages, params);

    return prefill_and_cache_prefix(tokens, prefix_tokens, key);

}


// ── Multimodal generation (v1.9.11 Phases 5–7 + v2.1.8) ────


namespace {


bool any_image_in(const std::vector<Message>& messages) {

    for (const auto& m : messages) {

        if (has_images(m.content_parts)) { return true; }

    }

    return false;

}


std::vector<Message> strip_image_parts(

    const std::vector<Message>& messages) {

    std::vector<Message> out = messages;

    for (auto& m : out) {

        if (m.content_parts.empty()) { continue; }

        m.content = extract_text(m.content_parts);

        m.content_parts.clear();

    }

    return out;

}


std::vector<Message> substitute_image_markers(

    const std::vector<Message>& messages,

    ::mtmd_context* ctx,

    std::vector<::mtmd_bitmap*>& bitmaps_out) {

    std::vector<Message> out;

    out.reserve(messages.size());

    const std::string marker = mtmd_default_marker();

    for (const auto& m : messages) {

        Message copy;

        copy.role = m.role;

        if (m.content_parts.empty()) {

            copy.content = m.content;

            out.push_back(std::move(copy));

            continue;

        }

        std::string built;

        for (const auto& p : m.content_parts) {

            if (p.type != ContentPartType::IMAGE) {

                built += p.text;

                continue;

            }

            ::mtmd_bitmap* bm = nullptr;

            if (!p.image_path.empty()) {

                bm = mtmd_helper_bitmap_init_from_file(

                    ctx, p.image_path.c_str(), /*placeholder=*/false).bitmap;

            }

            if (bm == nullptr) { return {}; }

            bitmaps_out.push_back(bm);

            built += marker;

        }

        copy.content = std::move(built);

        out.push_back(std::move(copy));

    }

    return out;

}


} // anonymous namespace


// Multimodal prefill: clears the context memory, tokenizes `prompt` together
// with `bitmaps` into mtmd chunks, and evaluates those chunks.
//
// The chunk list is freed on every path.
//
// @param prompt  Rendered prompt containing one marker per bitmap.
// @param bitmaps Image bitmaps, in marker order.
// @param err_msg Receives a description (including the rc) on failure.
// @return ENTROPIC_OK, or ENTROPIC_ERROR_GENERATE_FAILED when tokenization
//         or evaluation returned a non-zero code.
entropic_error_t LlamaCppBackend::mtmd_prefill(
    const std::string& prompt,
    const std::vector<::mtmd_bitmap*>& bitmaps,
    std::string& err_msg)
{
    llama_memory_clear(llama_get_memory(ctx_), true);

    ::mtmd_input_text text_input{prompt.c_str(), true, true};
    auto* chunk_list = mtmd_input_chunks_init();
    // mtmd_tokenize takes pointers-to-const; rebuild the array accordingly.
    std::vector<const ::mtmd_bitmap*> const_bitmaps(
        bitmaps.begin(), bitmaps.end());

    const int32_t tokenize_rc = mtmd_tokenize(
        mtmd_ctx_, chunk_list, &text_input,
        const_bitmaps.data(), const_bitmaps.size());
    if (tokenize_rc != 0) {
        mtmd_input_chunks_free(chunk_list);
        err_msg = "mtmd_tokenize failed (rc="
            + std::to_string(tokenize_rc) + ")";
        return ENTROPIC_ERROR_GENERATE_FAILED;
    }

    llama_pos n_past_after = 0;
    const int32_t eval_status = mtmd_helper_eval_chunks(
        mtmd_ctx_, ctx_, chunk_list, 0, 0,
        static_cast<int32_t>(config().n_batch),
        true, &n_past_after);
    mtmd_input_chunks_free(chunk_list);

    if (eval_status != 0) {
        err_msg = "mtmd_helper_eval_chunks failed (rc="
            + std::to_string(eval_status) + ")";
        return ENTROPIC_ERROR_GENERATE_FAILED;
    }

    logger->info("Multimodal prefill complete: n_past={}", n_past_after);
    return ENTROPIC_OK;
}


// Sampling loop for a context that was prefilled elsewhere (used after the
// multimodal prefill).
//
// Creates its own sampler, steps until max_tokens, cancellation or a terminal
// step status, then completes the result via finalize_generation().
//
// @param params   Sampling parameters, max_tokens and stop sequences.
// @param on_token Optional streaming callback passed to each step.
// @param cancel   Optional cancellation flag; may be nullptr.
// @param t0       Start time of the request, passed to the finalizers.
// @return The generation result, or an error result when no sampler could be
//         created.
GenerationResult LlamaCppBackend::run_sampling_loop(
    const GenerationParams& params,
    std::function<void(std::string_view token)> on_token,
    std::atomic<bool>* cancel,
    const std::chrono::steady_clock::time_point& t0)
{
    GenerationResult out;

    // v2.3.10: Sampler seam.
    auto sampler = create_sampler(params);
    if (sampler == nullptr) {
        out.finish_reason = "error";
        out.error_message = "Sampler factory not initialized";
        out.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
        finalize_result(out, t0);
        return out;
    }

    std::string text;
    int produced = 0;
    const auto stop_seqs = effective_stop(params);  // gh#105: per-call sequential marker

    while (produced < params.max_tokens) {
        const bool cancelled =
            cancel != nullptr && cancel->load(std::memory_order_acquire);
        if (cancelled) {
            out.finish_reason = "cancelled";
            out.error_code = ENTROPIC_ERROR_CANCELLED;
            break;
        }

        const auto outcome = step_token(*sampler, text, on_token, stop_seqs);
        if (outcome != "continue") {
            const bool failed = (outcome == "error");
            out.finish_reason = failed ? "error" : "stop";
            if (failed) {
                out.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
            }
            break;
        }
        ++produced;
    }

    finalize_generation(out, text, produced, params, t0);
    return out;
}


// Image-aware generation: load the images, render the prompt with image
// markers, run the multimodal prefill, then sample.
//
// Both failure returns now set finish_reason to "error", matching every other
// error result built in this file (prefill_error, batch_error_result, the
// sampler-init failures); previously they left it empty.
//
// @param messages Conversation containing at least one image part.
// @param params   Generation parameters.
// @param on_token Optional streaming callback.
// @param cancel   Optional cancellation flag; may be nullptr.
// @return The generation result; ENTROPIC_ERROR_IMAGE_LOAD_FAILED when an
//         image could not be loaded, or the prefill error code when the
//         multimodal prefill failed.
GenerationResult LlamaCppBackend::generate_multimodal(
    const std::vector<Message>& messages,
    const GenerationParams& params,
    std::function<void(std::string_view token)> on_token,
    std::atomic<bool>* cancel)
{
    auto t0 = entropic::log::now();
    invalidate_resident_kv();  // gh#96: mtmd_prefill mutates seq 0 out-of-band
    std::vector<::mtmd_bitmap*> bitmaps;
    auto marked = substitute_image_markers(
        messages, mtmd_ctx_, bitmaps);
    if (marked.empty()) {
        // Bitmaps loaded before the failing image are still ours to free.
        for (auto* b : bitmaps) { mtmd_bitmap_free(b); }
        GenerationResult err;
        err.error_code = ENTROPIC_ERROR_IMAGE_LOAD_FAILED;
        err.error_message =
            "mtmd_helper_bitmap_init_from_file failed";
        err.finish_reason = "error";
        return err;
    }
    auto prompt = render_prompt(marked, params);
    logger->info("Multimodal generate: {} images, prompt={} chars, max_tokens={}",
                 bitmaps.size(), prompt.size(), params.max_tokens);
    std::string prefill_err;
    auto rc = mtmd_prefill(prompt, bitmaps, prefill_err);
    // The bitmaps are not needed once the prefill has run.
    for (auto* b : bitmaps) { mtmd_bitmap_free(b); }
    if (rc != ENTROPIC_OK) {
        GenerationResult err;
        err.error_code = rc;
        err.error_message = std::move(prefill_err);
        err.finish_reason = "error";
        return err;
    }
    return run_sampling_loop(params, std::move(on_token), cancel, t0);
}


// ── Generation entry points ────────────────────────────────


// Non-cancellable generation entry point.
//
// Text-only conversations go straight to the text path. Conversations with
// images use the multimodal path when the model has vision support and an
// mtmd context; otherwise the image parts are stripped (with a warning) and
// the text path is used.
GenerationResult LlamaCppBackend::do_generate(
    const std::vector<Message>& messages,
    const GenerationParams& params)
{
    const bool with_images = any_image_in(messages);
    const bool vision_ready = has_vision_ && mtmd_ctx_ != nullptr;

    if (with_images && vision_ready) {
        return generate_multimodal(messages, params, nullptr, nullptr);
    }
    if (with_images) {
        logger->warn("Image content present but model has no vision "
                     "capability — stripping image parts");
        return do_generate_text_only(strip_image_parts(messages), params);
    }
    return do_generate_text_only(messages, params);
}


// Text-only generation without cancellation or streaming.
//
// Renders and tokenizes the prompt, creates the sampler, prefills through
// run_prefill_cached(), then steps until max_tokens or a terminal status.
//
// @return The finalized result, or the sampler-init / prefill error result.
GenerationResult LlamaCppBackend::do_generate_text_only(
    const std::vector<Message>& messages,
    const GenerationParams& params)
{
    auto t0 = entropic::log::now();
    std::string rendered = render_prompt(messages, params);
    auto prompt_tokens = tokenize(rendered, true);
    std::string system_text = extract_system_prompt(messages);

    logger->info("Generate: {} input tokens, max_tokens={}",
              prompt_tokens.size(), params.max_tokens);
    log_sampler_config(params);

    // v2.3.10: Sampler seam.
    auto sampler = create_sampler(params);
    if (sampler == nullptr) { return sampler_init_error(t0); }

    const bool prefilled =
        run_prefill_cached(prompt_tokens, system_text, messages, params);
    if (!prefilled) {
        return prefill_error();
    }

    GenerationResult out;
    std::string text;
    int produced = 0;
    std::function<void(std::string_view)> silent = nullptr;  // no streaming
    const auto stop_seqs = effective_stop(params);  // gh#105: per-call sequential marker

    while (produced < params.max_tokens) {
        const auto outcome = step_token(*sampler, text, silent, stop_seqs);
        if (outcome == "continue") {
            ++produced;
            continue;
        }
        const bool failed = (outcome == "error");
        out.finish_reason = failed ? "error" : "stop";
        if (failed) {
            out.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
        }
        break;
    }

    finalize_generation(out, text, produced, params, t0);
    return out;
}


// Cancellable generation entry point.
//
// Same dispatch as the non-cancellable overload, with `cancel` forwarded:
// text path for text-only input, multimodal path when images are present and
// vision is available, otherwise strip the image parts and use the text path.
GenerationResult LlamaCppBackend::do_generate(
    const std::vector<Message>& messages,
    const GenerationParams& params,
    std::atomic<bool>& cancel)
{
    const bool with_images = any_image_in(messages);
    const bool vision_ready = has_vision_ && mtmd_ctx_ != nullptr;

    if (with_images && vision_ready) {
        return generate_multimodal(messages, params, nullptr, &cancel);
    }
    if (with_images) {
        logger->warn("Image content present but model has no vision "
                     "capability — stripping image parts");
        return do_generate_text_only(
            strip_image_parts(messages), params, cancel);
    }
    return do_generate_text_only(messages, params, cancel);
}


// Text-only generation with cooperative cancellation and no streaming.
//
// Renders and tokenizes the prompt, creates the sampler, prefills through
// run_prefill_cached(), then steps until max_tokens, a terminal status, or
// `cancel` being set (checked before every step).
//
// @return The finalized result, or the sampler-init / prefill error result.
GenerationResult LlamaCppBackend::do_generate_text_only(
    const std::vector<Message>& messages,
    const GenerationParams& params,
    std::atomic<bool>& cancel)
{
    auto t0 = entropic::log::now();
    std::string rendered = render_prompt(messages, params);
    auto prompt_tokens = tokenize(rendered, true);
    std::string system_text = extract_system_prompt(messages);

    logger->info("Generate (cancellable): {} input tokens, max_tokens={}",
              prompt_tokens.size(), params.max_tokens);
    log_sampler_config(params);

    auto sampler = create_sampler(params);
    if (sampler == nullptr) { return sampler_init_error(t0); }

    const bool prefilled =
        run_prefill_cached(prompt_tokens, system_text, messages, params);
    if (!prefilled) {
        return prefill_error();
    }

    GenerationResult out;
    std::string text;
    int produced = 0;
    std::function<void(std::string_view)> silent = nullptr;  // no streaming
    const auto stop_seqs = effective_stop(params);  // gh#105: per-call sequential marker

    while (produced < params.max_tokens) {
        if (cancel.load(std::memory_order_acquire)) {
            out.finish_reason = "cancelled";
            out.error_code = ENTROPIC_ERROR_CANCELLED;
            break;
        }

        const auto outcome = step_token(*sampler, text, silent, stop_seqs);
        if (outcome == "continue") {
            ++produced;
            continue;
        }
        const bool failed = (outcome == "error");
        out.finish_reason = failed ? "error" : "stop";
        if (failed) {
            out.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
        }
        break;
    }

    finalize_generation(out, text, produced, params, t0);
    return out;
}


// Streaming generation entry point.
//
// Text-only input uses the streaming text path. Input with images uses the
// multimodal path when vision is available; otherwise the image parts are
// stripped (with a warning) and the streaming text path is used. `on_token`
// and `cancel` are forwarded in every case.
GenerationResult LlamaCppBackend::do_generate_streaming(
    const std::vector<Message>& messages,
    const GenerationParams& params,
    std::function<void(std::string_view token)> on_token,
    std::atomic<bool>& cancel)
{
    const bool with_images = any_image_in(messages);
    const bool vision_ready = has_vision_ && mtmd_ctx_ != nullptr;

    if (with_images && vision_ready) {
        return generate_multimodal(messages, params, on_token, &cancel);
    }
    if (with_images) {
        logger->warn("Image content present but model has no vision "
                     "capability — stripping image parts");
        return do_generate_streaming_text_only(
            strip_image_parts(messages), params, on_token, cancel);
    }
    return do_generate_streaming_text_only(
        messages, params, on_token, cancel);
}


// Text-only generation that streams each piece through `on_token` and polls
// `cancel` before every step.
//
// Renders and tokenizes the prompt, creates the sampler, prefills through
// run_prefill_cached(), then steps until max_tokens, a terminal status, or
// cancellation.
//
// @return The finalized result, or the sampler-init / prefill error result.
GenerationResult LlamaCppBackend::do_generate_streaming_text_only(
    const std::vector<Message>& messages,
    const GenerationParams& params,
    std::function<void(std::string_view token)> on_token,
    std::atomic<bool>& cancel)
{
    auto t0 = entropic::log::now();
    auto rendered = render_prompt(messages, params);
    auto prompt_tokens = tokenize(rendered, true);
    auto system_text = extract_system_prompt(messages);
    logger->info("Stream: {} input tokens, max_tokens={}",
              prompt_tokens.size(), params.max_tokens);
    log_sampler_config(params);

    // v2.3.10: Sampler seam.
    auto sampler = create_sampler(params);
    if (sampler == nullptr) { return sampler_init_error(t0); }

    const bool prefilled =
        run_prefill_cached(prompt_tokens, system_text, messages, params);
    if (!prefilled) {
        return prefill_error();
    }

    GenerationResult out;
    std::string text;
    int produced = 0;
    const auto stop_seqs = effective_stop(params);  // gh#105: per-call sequential marker

    while (produced < params.max_tokens) {
        if (cancel.load(std::memory_order_acquire)) {
            out.finish_reason = "cancelled";
            out.error_code = ENTROPIC_ERROR_CANCELLED;
            break;
        }

        const auto outcome = step_token(*sampler, text, on_token, stop_seqs);
        if (outcome == "continue") {
            ++produced;
            continue;
        }
        const bool failed = (outcome == "error");
        out.finish_reason = failed ? "error" : "stop";
        if (failed) {
            out.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
        }
        break;
    }

    finalize_generation(out, text, produced, params, t0);
    return out;
}


// Speculative generation is not available through this entry point: every
// argument is ignored and an ENTROPIC_ERROR_NOT_SUPPORTED result is returned
// whose message points to generate_speculative_with_draft.
GenerationResult LlamaCppBackend::do_generate_speculative(
    const std::vector<Message>& /*messages*/,
    const GenerationParams& /*params*/,
    std::function<void(std::string_view)> /*on_token*/,
    std::atomic<bool>& /*cancel*/)
{
    GenerationResult unsupported;
    unsupported.finish_reason = "error";
    unsupported.error_code = ENTROPIC_ERROR_NOT_SUPPORTED;
    unsupported.error_message =
        "LlamaCppBackend speculative requires an explicit draft "
        "backend handle — orchestrator dispatches via "
        "generate_speculative_with_draft";
    return unsupported;
}


namespace {


// Translate entropic GenerationParams into the common_params_sampling
// consumed by common_sampler_init on the speculative paths.
//
// Scalar knobs (temperature, top_k, top_p, min_p, repeat / frequency /
// presence penalties) are copied across, logit_bias entries are
// forwarded one by one, and the seed is only overridden when the caller
// supplied a non-negative value.
//
// The sampler chain is written out explicitly so it mirrors the chain
// entropic's create_sampler builds for plain decode (penalties → top_k
// → top_p → min_p → temperature → dist), which is what keeps the
// speculative output bit-identical to plain decode:
//   * min_p joins the chain only when params.min_p > 0, so the default
//     0.0 leaves the chain shape unchanged;
//   * the temperature stage is dropped entirely when temperature is 0
//     (greedy), because common_sampler's extended-temperature stage is
//     not equivalent to having no temperature stage;
//   * the remaining extended filters (top_n_sigma, dry, xtc, typical_p)
//     stay out of the chain.
common_params_sampling to_common_sampling(
    const GenerationParams& params) {
    common_params_sampling out;

    // Plain scalar copies.
    out.temp = params.temperature;
    out.top_k = params.top_k;
    out.top_p = params.top_p;
    out.min_p = params.min_p;
    out.penalty_repeat = params.repeat_penalty;
    out.penalty_freq = params.frequency_penalty;
    out.penalty_present = params.presence_penalty;

    // Fixed settings for the speculative path.
    out.no_perf = true;
    out.dry_multiplier = 0.0f;
    out.top_n_sigma = -1.0f;

    // A negative seed keeps whatever default common_params_sampling has.
    if (params.seed >= 0) {
        out.seed = static_cast<uint32_t>(params.seed);
    }

    // An empty logit_bias leaves the chain untouched.
    for (const auto& [token, bias] : params.logit_bias) {
        out.logit_bias.push_back({token, bias});
    }

    const bool want_min_p = params.min_p > 0.0f;
    const bool want_temperature = params.temperature > 0.0f;

    out.samplers = {COMMON_SAMPLER_TYPE_PENALTIES,
                    COMMON_SAMPLER_TYPE_TOP_K,
                    COMMON_SAMPLER_TYPE_TOP_P};
    if (want_min_p) {
        out.samplers.push_back(COMMON_SAMPLER_TYPE_MIN_P);
    }
    if (want_temperature) {
        out.samplers.push_back(COMMON_SAMPLER_TYPE_TEMPERATURE);
    }
    return out;
}


// Decode every token of `tokens` except the last one into `ctx`, in
// chunks of at most llama_n_batch(ctx) tokens.
//
// Returns true when there is nothing to prefill (zero or one token) or
// when every chunk decodes with rc == 0; returns false on the first
// failing llama_decode.
bool spec_prefill_minus_last(
    llama_context* ctx, const std::vector<llama_token>& tokens) {
    const int n_prefill = static_cast<int>(tokens.size()) - 1;
    if (n_prefill <= 0) { return true; }

    const int n_batch = llama_n_batch(ctx);
    // llama_batch_get_one takes a mutable pointer; the tokens are only
    // handed to llama_decode here.
    llama_token* const base = const_cast<llama_token*>(tokens.data());

    int done = 0;
    while (done < n_prefill) {
        const int take = std::min(n_batch, n_prefill - done);
        llama_batch chunk = llama_batch_get_one(base + done, take);
        if (llama_decode(ctx, chunk) != 0) { return false; }
        done += n_batch;
    }
    return true;
}


// Build an error-only GenerationResult: the given code and message,
// with finish_reason fixed to "error".
GenerationResult spec_error(entropic_error_t code, std::string msg) {
    GenerationResult failure;
    failure.finish_reason = "error";
    failure.error_message = std::move(msg);
    failure.error_code = code;
    return failure;
}


} // anonymous namespace


// Mutable state for one speculative generation run, shared by the
// separate-draft kernel and the MTP kernel.
struct SpeculativeRunState {
    // Handles released by spec_cleanup.
    common_speculative* spec{nullptr};
    common_sampler* smpl{nullptr};

    // Target / draft contexts supplied by the caller; spec_cleanup does
    // not free them.
    llama_context* ctx_tgt{nullptr};
    llama_context* ctx_dft{nullptr};

    // Batch used for decoding; freed by spec_cleanup only when
    // batch_initialized is set.
    llama_batch batch_tgt{};
    bool batch_initialized{false};

    // Sequence id passed to every batch / memory call.
    llama_seq_id seq_id{0};

    // Position at which id_last is placed in the next batch.
    int n_past{0};
    // Most recent token that has not been decoded yet.
    llama_token id_last{0};

    // Tokens that precede id_last (prompt minus its last token, plus
    // every token emitted so far).
    std::vector<llama_token> prompt_tgt;
    // Draft tokens for the current round.
    std::vector<llama_token> draft;

    // Text emitted so far and the stop sequences it is checked against.
    std::string generated;
    std::vector<std::string> stop;

    // Counters surfaced in the final result.
    int n_generated{0};
    int n_drafted{0};
    int n_accepted{0};

    // Set when an end-of-generation token is emitted.
    bool has_eos{false};

    // Outcome fields copied into the GenerationResult.
    std::string finish_reason;
    entropic_error_t error_code{ENTROPIC_OK};
    std::string error_message;

    // ── Checkpoint state (v2.1.11) ──────────────────────────
    // Enabled when a context reports FULL-only seq_rm (no partial
    // removal). The kernel then saves and restores draft/target state
    // around each speculative round so the memory module never sees an
    // attempted partial removal. Mirrors the use_ckpt_tgt /
    // use_ckpt_dft flow in upstream's speculative-simple example.
    bool use_ckpt_tgt{false};
    bool use_ckpt_dft{false};
    common_prompt_checkpoint ckpt;
};


// Release the resources a speculative run owns: the speculative
// decoder, the sampler, and the batch (when it was allocated).
//
// Each handle is cleared after it is freed, so calling this again on the
// same state is a no-op rather than a double free. The target / draft
// contexts are not touched.
static void spec_cleanup(SpeculativeRunState& state) {
    if (state.spec) {
        common_speculative_free(state.spec);
        state.spec = nullptr;
    }
    if (state.smpl) {
        common_sampler_free(state.smpl);
        state.smpl = nullptr;
    }
    if (state.batch_initialized) {
        llama_batch_free(state.batch_tgt);
        state.batch_tgt = llama_batch{};
        state.batch_initialized = false;
    }
}


// Rebuild state.batch_tgt as [id_last, draft...]: id_last sits at
// position n_past and each draft token takes the next consecutive
// position. Every entry requests logits.
static void spec_build_batch(SpeculativeRunState& state) {
    llama_batch& batch = state.batch_tgt;
    common_batch_clear(batch);

    int next_pos = state.n_past;
    common_batch_add(batch, state.id_last, next_pos++,
                     {state.seq_id}, true);
    for (const llama_token tok : state.draft) {
        common_batch_add(batch, tok, next_pos++, {state.seq_id}, true);
    }
}


// Build the [id_last, draft...] batch and decode it on the target
// context, then on the draft context.
//
// On the first non-zero return code the failure is logged, the state's
// error fields are filled (GENERATE_FAILED / finish_reason "error") and
// false is returned; the draft decode is not attempted after a target
// failure. Returns true when both decodes succeed.
static bool spec_decode_both(SpeculativeRunState& state) {
    const auto mark_failed = [&state](const char* message) {
        state.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
        state.error_message = message;
        state.finish_reason = "error";
        return false;
    };

    spec_build_batch(state);

    if (const int rc = llama_decode(state.ctx_tgt, state.batch_tgt);
        rc != 0) {
        logger->error("Speculative target decode failed: rc={}, "
                      "n_past={}, draft_size={}",
                      rc, state.n_past, state.draft.size());
        return mark_failed("target llama_decode failed");
    }

    if (const int rc = llama_decode(state.ctx_dft, state.batch_tgt);
        rc != 0) {
        logger->error("Speculative draft decode failed: rc={}, "
                      "n_past={}, draft_size={}",
                      rc, state.n_past, state.draft.size());
        return mark_failed("draft llama_decode failed");
    }
    return true;
}


// Ask the speculative decoder for a fresh draft.
//
// Fills the per-sequence draft params (n_max = -1, current n_past /
// id_last, prompt and result pointing at state.prompt_tgt and
// state.draft), runs common_speculative_draft, and returns the
// resulting size of state.draft.
static int spec_run_draft(SpeculativeRunState& state) {
    auto& request = common_speculative_get_draft_params(
        state.spec, state.seq_id);
    request.prompt = &state.prompt_tgt;
    request.result = &state.draft;
    request.id_last = state.id_last;
    request.n_past = state.n_past;
    request.n_max = -1;
    request.drafting = true;

    common_speculative_draft(state.spec);

    const auto produced = state.draft.size();
    return static_cast<int>(produced);
}


// Commit one accepted token to the run state and report whether
// generation must stop.
//
// For every token (end-of-generation included) the previous id_last is
// appended to prompt_tgt, `id` becomes the new id_last and n_generated
// is incremented. A non-EOG token is also detokenized, appended to
// state.generated and forwarded to on_token when that callback is set.
//
// Returns "" to keep going, otherwise the first matching signal, checked
// in this order:
//   "eos"    — `id` is an end-of-generation token
//   "stop"   — generated text now ends with one of state.stop
//   "cancel" — the cancel flag is set
//   "length" — n_generated reached max_tokens
// The matching finish_reason (and the CANCELLED error code for
// "cancel") is recorded on the state.
static std::string spec_emit_token(
    SpeculativeRunState& state, llama_token id,
    const llama_vocab* vocab, int max_tokens,
    std::function<void(std::string_view)>& on_token,
    std::atomic<bool>& cancel)
{
    state.prompt_tgt.push_back(state.id_last);
    state.id_last = id;
    ++state.n_generated;

    if (llama_vocab_is_eog(vocab, id)) {
        state.has_eos = true;
        state.finish_reason = "stop";
        return "eos";
    }

    const std::string piece = common_token_to_piece(state.ctx_tgt, id);
    state.generated += piece;
    if (on_token) { on_token(piece); }

    // gh#108: stop sequences (params.stop plus the gh#103 sequential-tool
    // close marker) are honored so MTP halts where plain decode would.
    // state.stop is empty on the gh#36 path, making this a no-op there.
    if (check_stop_sequences(state.generated, state.stop)) {
        state.finish_reason = "stop";
        return "stop";
    }
    if (cancel.load(std::memory_order_acquire)) {
        state.error_code = ENTROPIC_ERROR_CANCELLED;
        state.finish_reason = "cancelled";
        return "cancel";
    }
    if (state.n_generated >= max_tokens) {
        state.finish_reason = "length";
        return "length";
    }
    return {};
}


// Record the checkpoint position (prompt_tgt length plus the target
// memory's min / max position for this sequence) and, when the draft
// context uses checkpoints, snapshot the draft state as well.
static void spec_ckpt_save_dft(SpeculativeRunState& state) {
    const auto mem_tgt = llama_get_memory(state.ctx_tgt);
    const auto n_tokens = static_cast<int64_t>(state.prompt_tgt.size());
    const auto pos_min = llama_memory_seq_pos_min(mem_tgt, state.seq_id);
    const auto pos_max = llama_memory_seq_pos_max(mem_tgt, state.seq_id);
    state.ckpt.update_pos(n_tokens, pos_min, pos_max);

    if (!state.use_ckpt_dft) { return; }

    constexpr auto flags = LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY
                         | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE;
    state.ckpt.update_dft(state.ctx_dft, state.seq_id, flags);
}


// Snapshot the target state into the checkpoint, but only when the
// target uses checkpoints and the current draft is non-empty.
static void spec_ckpt_save_tgt(SpeculativeRunState& state) {
    if (!state.use_ckpt_tgt || state.draft.empty()) { return; }

    constexpr auto flags = LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY
                         | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE;
    state.ckpt.update_tgt(state.ctx_tgt, state.seq_id, flags);
}


// Put the draft context back to the checkpointed position: reload the
// draft snapshot when the draft uses checkpoints, then remove every
// draft-memory entry past the checkpoint's pos_max for this sequence.
static void spec_ckpt_restore_dft(SpeculativeRunState& state) {
    if (state.use_ckpt_dft) {
        state.ckpt.load_dft(
            state.ctx_dft, state.seq_id,
            LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY
                | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
    }

    const auto first_stale = state.ckpt.pos_max + 1;
    const auto mem_dft = llama_get_memory(state.ctx_dft);
    llama_memory_seq_rm(mem_dft, state.seq_id, first_stale, -1);
}


// Undo a partially accepted round on a checkpointed target.
//
// The tokens in `ids` are moved into state.draft so the next round
// re-verifies them instead of drafting again (spec_prepare_draft skips
// drafting when state.draft is non-empty). Target and draft contexts are
// reloaded from the checkpoint and trimmed past ckpt.pos_max, prompt_tgt
// and n_past are reset to the checkpointed token count, and the live
// sampler is replaced by `smpl_save`.
//
// Takes ownership of `smpl_save`; `ids` is left moved-from.
static void spec_rollback_partial(

    SpeculativeRunState& state, common_sampler* smpl_save,

    std::vector<llama_token>& ids) {

    constexpr auto flags = LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY

                         | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE;

    // Carry the sampled ids over as the next round's draft.
    state.draft = std::move(ids);

    // Restore the target, then drop anything past the checkpoint.
    state.ckpt.load_tgt(state.ctx_tgt, state.seq_id, flags);

    llama_memory_seq_rm(llama_get_memory(state.ctx_tgt),

                        state.seq_id, state.ckpt.pos_max + 1, -1);

    // Same for the draft context (done regardless of use_ckpt_dft).
    state.ckpt.load_dft(state.ctx_dft, state.seq_id, flags);

    llama_memory_seq_rm(llama_get_memory(state.ctx_dft),

                        state.seq_id, state.ckpt.pos_max + 1, -1);

    // Rewind the token bookkeeping to the checkpointed length.
    state.prompt_tgt.resize(static_cast<size_t>(state.ckpt.n_tokens));

    state.n_past = static_cast<int>(state.prompt_tgt.size());

    // Replace the sampler (which has already accepted this round's
    // tokens) with the clone taken before sampling. The caller only
    // creates that clone when use_ckpt_tgt is set, which is also the
    // only case in which it calls this function.
    common_sampler_free(state.smpl);

    state.smpl = smpl_save;

}


// Remove every memory entry at position >= n_past for this sequence,
// first from the target context and then from the draft context.
static void spec_trim_rejected_drafts(SpeculativeRunState& state) {
    for (llama_context* ctx : {state.ctx_tgt, state.ctx_dft}) {
        llama_memory_seq_rm(llama_get_memory(ctx),
                            state.seq_id, state.n_past, -1);
    }
}


// Emit the tokens in `ids` one by one through spec_emit_token.
//
// Returns true as soon as a token produces a non-empty stop signal (the
// remaining ids are not emitted), false when every token was emitted
// without one.
static bool spec_commit_accepted(
    SpeculativeRunState& state,
    const std::vector<llama_token>& ids,
    const llama_vocab* vocab, int max_tokens,
    std::function<void(std::string_view)>& on_token,
    std::atomic<bool>& cancel) {
    for (const llama_token id : ids) {
        const std::string signal = spec_emit_token(
            state, id, vocab, max_tokens, on_token, cancel);
        if (!signal.empty()) { return true; }
    }
    return false;
}


// Make sure state.draft holds the draft for this round and return its
// size.
//
// With an empty draft a new one is produced: save the draft-side
// checkpoint, run the drafter, save the target-side checkpoint, then
// restore the draft context. A non-empty draft is a carry-over from a
// previous partial-accept rollback and is reused untouched.
static int spec_prepare_draft(SpeculativeRunState& state) {
    if (state.draft.empty()) {
        spec_ckpt_save_dft(state);
        const int n_new = spec_run_draft(state);
        spec_ckpt_save_tgt(state);
        spec_ckpt_restore_dft(state);
        return n_new;
    }
    return static_cast<int>(state.draft.size());
}


// Run one round of the separate-draft speculative kernel.
//
// Steps: obtain a draft (spec_prepare_draft), decode [id_last, draft...]
// on both contexts, sample against the target, then either roll back
// (checkpointed target with a partially accepted draft) or commit the
// sampled tokens and trim the rejected tail from both contexts.
//
// Returns true when the caller should run another round, false when
// generation must end: either a decode failed (error fields already set
// on `state`) or spec_commit_accepted reported a stop signal.
static bool spec_accept_round(

    SpeculativeRunState& state,

    const llama_vocab* vocab,

    int max_tokens,

    std::function<void(std::string_view)>& on_token,

    std::atomic<bool>& cancel)

{

    int draft_size_before = spec_prepare_draft(state);


    if (!spec_decode_both(state)) { return false; }


    // Clone the sampler before it accepts anything so a rollback can
    // restore it. Ownership of the clone passes to
    // spec_rollback_partial on the rollback path; otherwise it is freed
    // below.
    common_sampler* smpl_save = nullptr;

    if (state.use_ckpt_tgt) {

        smpl_save = common_sampler_clone(state.smpl);

    }

    auto ids = common_sampler_sample_and_accept_n(

        state.smpl, state.ctx_tgt, state.draft);

    // One of the returned ids is not counted as an accepted draft token;
    // clamp so an empty result counts as zero accepted.
    int accepted = static_cast<int>(ids.size()) - 1;

    if (accepted < 0) { accepted = 0; }


    // Partial acceptance on a FULL-seq_rm context: rollback to

    // checkpoint, set draft = accepted, re-loop without emitting.

    if (state.use_ckpt_tgt

        && static_cast<int>(ids.size()) - 1

               < static_cast<int>(state.draft.size())) {

        spec_rollback_partial(state, smpl_save, ids);

        return true;

    }

    if (smpl_save) { common_sampler_free(smpl_save); }


    common_speculative_accept(state.spec, state.seq_id, accepted);

    state.n_drafted += draft_size_before;

    state.n_accepted += accepted;

    // n_past advances by ids.size() total: one slot for id_last

    // (the post-id_last position the next id will occupy), plus

    // `accepted` slots for the drafted tokens the sampler agreed

    // with. Matches speculative-simple's n_past++ in batch_add +

    // n_past += ids.size() - 1 sequence.

    state.n_past += static_cast<int>(ids.size());


    bool stop = spec_commit_accepted(

        state, ids, vocab, max_tokens, on_token, cancel);

    // The draft is consumed; drop memory entries at positions >= n_past
    // from both contexts.
    state.draft.clear();

    spec_trim_rejected_drafts(state);

    return !stop;

}


// Validate that the separate-draft speculative kernel may run.
//
// Returns an empty string when every precondition holds, otherwise a
// human-readable reason. The caller maps a reason containing
// "requires ACTIVE" to INVALID_STATE and every other reason to
// NOT_SUPPORTED, so that wording must stay as is.
//
// Checks, in order:
//   1. both backends are active and both context pointers are non-null;
//   2. the target model is neither recurrent nor hybrid;
//   3. neither context reports "no seq_rm at all".
//
// The liveness check runs before anything touches the contexts: the
// model lookup and the seq_rm capability queries take the context
// pointers, which cannot be assumed valid for an inactive backend. As a
// consequence the capability log line is only emitted once both sides
// are known to be usable.
static std::string spec_check_preconditions(
    bool target_active, bool draft_active,
    llama_context* ctx_tgt, llama_context* ctx_dft) {
    if (!target_active || !draft_active
        || ctx_tgt == nullptr || ctx_dft == nullptr) {
        return "speculative requires ACTIVE target + draft";
    }

    // Defense-in-depth arch gate — orchestrator's
    // check_speculative_compat is the primary gate; a direct caller
    // into the kernel must also be refused on recurrent / hybrid
    // targets (Session 5 Gate A: hybrid SSM state diverges across
    // split-prefill boundaries; bit-identical unreachable at this pin).
    const llama_model* model_tgt = llama_get_model(ctx_tgt);
    const int cap_tgt = common_context_can_seq_rm(ctx_tgt);
    const int cap_dft = common_context_can_seq_rm(ctx_dft);
    logger->info("Speculative seq_rm capability: target={}, draft={} "
                 "(0=NO, 1=PART, 2=FULL)", cap_tgt, cap_dft);

    std::string err;
    if (llama_model_is_recurrent(model_tgt)
        || llama_model_is_hybrid(model_tgt)) {
        err = "speculative refused: architecture (target is "
              "recurrent or hybrid; see proposal Implementation "
              "Log Gate A)";
    } else if (cap_tgt == COMMON_CONTEXT_SEQ_RM_TYPE_NO
               || cap_dft == COMMON_CONTEXT_SEQ_RM_TYPE_NO) {
        // NO is the only unsupported seq_rm case — the kernel has
        // both PART fast-path and FULL checkpoint paths.
        err = "speculative kernel requires at least FULL seq_rm "
              "(target/draft reported NO seq_rm at all)";
    }
    return err;
}


// Create the sampler, the DRAFT_SIMPLE speculative decoder and the
// decode batch for a separate-draft run, then decide whether the
// checkpoint flow is needed.
//
// n_draft_max <= 0 falls back to a draft length of 16.
//
// Returns "" on success or a short failure description. When the
// decoder cannot be created the sampler is freed and cleared before
// returning.
static std::string spec_init_sampler_and_decoder(
    SpeculativeRunState& state, llama_model* model_tgt,
    const GenerationParams& params, int n_draft_max,
    const std::string& draft_path) {
    auto sampling_cfg = to_common_sampling(params);
    state.smpl = common_sampler_init(model_tgt, sampling_cfg);
    if (state.smpl == nullptr) { return "common_sampler_init failed"; }

    common_params_speculative cfg;
    cfg.types = {COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE};
    cfg.draft.n_max = 16;
    if (n_draft_max > 0) { cfg.draft.n_max = n_draft_max; }
    cfg.draft.ctx_tgt = state.ctx_tgt;
    cfg.draft.ctx_dft = state.ctx_dft;
    // Upstream gates DRAFT_SIMPLE on a non-empty draft path
    // (see common/speculative.cpp:875), so it is required even though
    // already-loaded contexts are supplied.
    cfg.draft.mparams.path = draft_path;

    state.spec = common_speculative_init(cfg, 1);
    if (state.spec == nullptr) {
        common_sampler_free(state.smpl);
        state.smpl = nullptr;
        return "common_speculative_init failed";
    }

    common_speculative_begin(state.spec, state.seq_id, state.prompt_tgt);

    state.batch_tgt = llama_batch_init(llama_n_batch(state.ctx_tgt), 0, 1);
    state.batch_initialized = true;

    // The checkpoint flow is used for a context that can only do
    // FULL-sequence removal. Mirrors speculative-simple's
    // use_ckpt_{tgt,dft}.
    const auto cap_tgt = common_context_can_seq_rm(state.ctx_tgt);
    state.use_ckpt_tgt = (cap_tgt == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
    const auto cap_dft = common_context_can_seq_rm(state.ctx_dft);
    state.use_ckpt_dft = (cap_dft == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
    return {};
}


// Prepare a separate-draft speculative run from a tokenized prompt.
//
// The last prompt token becomes id_last; the rest becomes prompt_tgt and
// is prefilled into both contexts after their memory is cleared. The
// sampler / decoder / batch are then created.
//
// `tokens` must be non-empty (the visible caller enforces >= 2).
// Returns "" on success or a failure description.
static std::string spec_init_run(
    SpeculativeRunState& state, llama_model* model_tgt,
    const std::vector<llama_token>& tokens,
    const GenerationParams& params, int n_draft_max,
    const std::string& draft_path) {
    state.prompt_tgt.assign(tokens.begin(), tokens.end() - 1);
    state.id_last = tokens.back();
    state.n_past = static_cast<int>(tokens.size()) - 1;

    const auto mem_tgt = llama_get_memory(state.ctx_tgt);
    llama_memory_clear(mem_tgt, true);
    const auto mem_dft = llama_get_memory(state.ctx_dft);
    llama_memory_clear(mem_dft, true);

    // Target first; the draft prefill is skipped if the target fails.
    const bool prefilled =
        spec_prefill_minus_last(state.ctx_tgt, tokens)
        && spec_prefill_minus_last(state.ctx_dft, tokens);
    if (!prefilled) {
        return "speculative prefill failed";
    }

    return spec_init_sampler_and_decoder(
        state, model_tgt, params, n_draft_max, draft_path);
}


// Drive speculative rounds until the token budget is used up, the
// cancel flag is seen, or a round asks to stop.
//
// Cancellation observed at the top of an iteration records
// ENTROPIC_ERROR_CANCELLED / "cancelled". If nothing set a finish_reason
// by the time the loop ends, it becomes "length" when the budget was
// reached and "stop" otherwise.
static void spec_run_loop(
    SpeculativeRunState& state, const llama_vocab* vocab,
    int max_tokens,
    std::function<void(std::string_view)>& on_token,
    std::atomic<bool>& cancel) {
    for (;;) {
        if (state.n_generated >= max_tokens) { break; }
        if (cancel.load(std::memory_order_acquire)) {
            state.error_code = ENTROPIC_ERROR_CANCELLED;
            state.finish_reason = "cancelled";
            break;
        }
        const bool keep_going = spec_accept_round(
            state, vocab, max_tokens, on_token, cancel);
        if (!keep_going) { break; }
    }

    if (!state.finish_reason.empty()) { return; }
    if (state.n_generated >= max_tokens) {
        state.finish_reason = "length";
    } else {
        state.finish_reason = "stop";
    }
}


// Turn a finished run into a GenerationResult and release the run's
// resources.
//
// Copies text, token count, finish reason, error fields and the
// draft / accept counters, measures elapsed time since `t0`, and derives
// tokens-per-second when both the token count and the elapsed time are
// positive. The acceptance rate is logged when anything was drafted.
// spec_cleanup runs last.
static GenerationResult spec_finalize(
    SpeculativeRunState& state,
    std::chrono::steady_clock::time_point t0) {
    GenerationResult out;
    out.content = state.generated;
    out.token_count = state.n_generated;
    out.finish_reason = state.finish_reason;
    out.error_code = state.error_code;
    out.error_message = state.error_message;

    // gh#106: expose draft / accept counts so callers (and the MTP
    // engagement test) can confirm the kernel ran and accepted tokens.
    out.n_drafted = state.n_drafted;
    out.n_accepted = state.n_accepted;

    out.generation_time_ms =
        entropic::log::elapsed_ms(t0, entropic::log::now());

    // gh#108: throughput used to stay at 0.0 on this path; compute it
    // the same way finalize_result does.
    const bool measurable =
        out.token_count > 0 && out.generation_time_ms > 0.0;
    if (measurable) {
        out.throughput_tok_s =
            static_cast<double>(out.token_count)
            / out.generation_time_ms * 1000.0;
    }

    if (state.n_drafted > 0) {
        logger->info("Speculative: generated={}, drafted={}, "
                     "accepted={}, accept_rate={:.3f}",
                     state.n_generated, state.n_drafted,
                     state.n_accepted,
                     static_cast<float>(state.n_accepted)
                         / static_cast<float>(state.n_drafted));
    }

    spec_cleanup(state);
    return out;
}


// Run the separate-draft speculative kernel over an already tokenized
// prompt: initialise the run, loop rounds, finalize.
//
// An initialisation failure releases whatever was created and yields a
// GENERATE_FAILED error result carrying the failure description.
static GenerationResult spec_run_from_tokens(
    llama_context* ctx_tgt, llama_context* ctx_dft, llama_model* model_tgt,
    const std::vector<llama_token>& tokens, const GenerationParams& params,
    std::function<void(std::string_view)>& on_token,
    std::atomic<bool>& cancel, int n_draft_max,
    const std::string& draft_path,
    std::chrono::steady_clock::time_point t0) {
    SpeculativeRunState state;
    state.ctx_tgt = ctx_tgt;
    state.ctx_dft = ctx_dft;

    std::string failure = spec_init_run(
        state, model_tgt, tokens, params, n_draft_max, draft_path);
    if (failure.empty()) {
        const llama_vocab* vocab = llama_model_get_vocab(model_tgt);
        spec_run_loop(state, vocab, params.max_tokens, on_token, cancel);
        return spec_finalize(state, t0);
    }

    spec_cleanup(state);
    return spec_error(ENTROPIC_ERROR_GENERATE_FAILED, std::move(failure));
}


// Speculative generation with an explicit draft backend.
//
// Flow: invalidate the resident KV, check preconditions, render and
// tokenize the prompt, then hand off to spec_run_from_tokens with this
// backend's context as target and `draft`'s context as drafter.
//
// Error results:
//   * precondition reason containing "requires ACTIVE" → INVALID_STATE
//   * any other precondition reason                    → NOT_SUPPORTED
//   * fewer than 2 prompt tokens                       → GENERATE_FAILED
GenerationResult LlamaCppBackend::generate_speculative_with_draft(
    const std::vector<Message>& messages,
    const GenerationParams& params,
    std::function<void(std::string_view)> on_token,
    std::atomic<bool>& cancel,
    LlamaCppBackend& draft,
    int n_draft_max,
    const std::string& draft_path)
{
    const auto t0 = entropic::log::now();
    invalidate_resident_kv();  // gh#96: speculative path manages seq 0 itself

    std::string refusal = spec_check_preconditions(
        is_active(), draft.is_active(), ctx_, draft.ctx_);
    if (!refusal.empty()) {
        const bool inactive =
            refusal.find("requires ACTIVE") != std::string::npos;
        const entropic_error_t code = inactive
            ? ENTROPIC_ERROR_INVALID_STATE
            : ENTROPIC_ERROR_NOT_SUPPORTED;
        return spec_error(code, std::move(refusal));
    }

    auto prompt = render_prompt(messages, params);
    auto tokens = tokenize(prompt, true);
    if (tokens.size() < 2) {
        return spec_error(ENTROPIC_ERROR_GENERATE_FAILED,
            "speculative prompt must have at least 2 tokens");
    }

    logger->info("Speculative: {} input tokens, max_tokens={}, "
                 "n_draft_max={}",
                 tokens.size(), params.max_tokens, n_draft_max);
    return spec_run_from_tokens(
        ctx_, draft.ctx_, model_, tokens, params, on_token,
        cancel, n_draft_max, draft_path, t0);
}


// ── gh#106 (v2.9.0): target-owned MTP speculative kernel ───────────

//

// Distinct from the gh#36 separate-draft kernel above. The MTP head

// (ctx_dft) shares the target's KV via ctx_other, so the CALLER only

// ever decodes ctx_tgt; the impl owns every ctx_dft decode. The loop is

// draft → decode(ctx_tgt) → process → sample_and_accept_n → accept,

// mirroring extern/llama.cpp/tools/server/server-context.cpp. Reuses the

// gh#36 file-local helpers (spec_build_batch / spec_emit_token /

// spec_commit_accepted / spec_trim_rejected_drafts / spec_finalize /

// spec_cleanup / spec_error / to_common_sampling) — only the decode step

// and the prefill differ. NO checkpoint dance: shared-KV gemma4 targets

// are PART-seq_rm, so the FULL-only rollback path never applies.


namespace {


// Ask the MTP speculative decoder for a draft of at most `n_max`
// tokens.
//
// Fills the per-sequence draft params from the run state (prompt and
// result point at state.prompt_tgt / state.draft), runs
// common_speculative_draft, and returns the resulting draft size.
int mtp_run_draft(SpeculativeRunState& state, int n_max) {
    auto& request =
        common_speculative_get_draft_params(state.spec, state.seq_id);
    request.prompt = &state.prompt_tgt;
    request.result = &state.draft;
    request.id_last = state.id_last;
    request.n_past = state.n_past;
    request.n_max = n_max;
    request.drafting = true;

    common_speculative_draft(state.spec);

    const auto produced = state.draft.size();
    return static_cast<int>(produced);
}


// Decode the [id_last, draft...] batch on the target context and feed
// the same batch to common_speculative_process.
//
// Returns true when both steps succeed. On failure the state's error
// fields are set (GENERATE_FAILED / finish_reason "error"), the failure
// is logged with the decode return code (where there is one) and the
// current position / draft size — matching what spec_decode_both logs
// for the separate-draft kernel — and false is returned.
bool mtp_decode_and_process(SpeculativeRunState& state) {
    spec_build_batch(state);  // [id_last@n_past, draft@n_past+1 ...]

    const int rc = llama_decode(state.ctx_tgt, state.batch_tgt);
    if (rc != 0) {
        logger->error("MTP target decode failed: rc={}, "
                      "n_past={}, draft_size={}",
                      rc, state.n_past, state.draft.size());
        state.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
        state.error_message = "MTP target decode failed";
        state.finish_reason = "error";
        return false;
    }

    if (!common_speculative_process(state.spec, state.batch_tgt)) {
        logger->error("MTP common_speculative_process failed: "
                      "n_past={}, draft_size={}",
                      state.n_past, state.draft.size());
        state.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
        state.error_message = "common_speculative_process failed";
        state.finish_reason = "error";
        return false;
    }
    return true;
}


// Run one round of the MTP speculative kernel.
//
// Steps: draft up to `n_max` tokens, decode [id_last, draft...] on the
// target and pass the batch to the speculative decoder, sample against
// the target, then commit the sampled tokens and trim memory entries at
// positions >= n_past.
//
// Returns true when the caller should run another round, false when
// generation must end: either decode/process failed (error fields
// already set on `state`) or spec_commit_accepted reported a stop
// signal.
bool mtp_accept_round(

    SpeculativeRunState& state, int n_max, const llama_vocab* vocab,

    int max_tokens, std::function<void(std::string_view)>& on_token,

    std::atomic<bool>& cancel) {

    int drafted = mtp_run_draft(state, n_max);

    if (!mtp_decode_and_process(state)) { return false; }

    auto ids = common_sampler_sample_and_accept_n(

        state.smpl, state.ctx_tgt, state.draft);

    // One of the returned ids is not counted as an accepted draft token;
    // clamp so an empty result counts as zero accepted.
    int accepted = static_cast<int>(ids.size()) - 1;

    if (accepted < 0) { accepted = 0; }

    // gh#108: only accept into the spec when this round actually drafted.

    // common_speculative_accept asserts impl_last[seq] (speculative.cpp:1650),

    // which is set ONLY for a non-empty draft (1604/1614) — a zero-draft round

    // would abort. The round still progresses by one token (the bonus in ids),

    // and process() already updated pending_h, so skipping accept is equivalent.

    if (drafted > 0) {

        common_speculative_accept(state.spec, state.seq_id, accepted);

    }

    state.n_drafted += drafted;

    state.n_accepted += accepted;

    // Same layout as gh#36: id_last fills one slot at n_past, the

    // `accepted` drafts fill the next slots — n_past advances by ids.size().

    state.n_past += static_cast<int>(ids.size());

    bool stop = spec_commit_accepted(

        state, ids, vocab, max_tokens, on_token, cancel);

    // The draft is consumed; drop memory entries at positions >= n_past.
    state.draft.clear();

    spec_trim_rejected_drafts(state);

    return !stop;

}


// Prefill one slice of the prompt: tokens prompt_tgt[off, off + chunk)
// are batched at their own indices as positions (no logits requested),
// decoded on the target, and the batch is then passed to
// common_speculative_process.
//
// Returns false when the decode fails (process is then skipped) or when
// process reports failure.
bool mtp_process_chunk(SpeculativeRunState& state, int off, int chunk) {
    llama_batch& batch = state.batch_tgt;
    common_batch_clear(batch);

    const int end = off + chunk;
    for (int pos = off; pos < end; ++pos) {
        common_batch_add(batch, state.prompt_tgt[pos], pos,
                         {state.seq_id}, false);
    }

    const bool decoded = llama_decode(state.ctx_tgt, batch) == 0;
    return decoded && common_speculative_process(state.spec, batch);
}


// Prefill all of prompt_tgt through mtp_process_chunk, in slices of at
// most llama_n_batch(ctx_tgt) tokens.
//
// An empty prompt_tgt (single-token prompt) succeeds immediately, so the
// first round drafts without any prefilled state. Returns false on the
// first failing slice.
bool mtp_prefill_and_seed(SpeculativeRunState& state) {
    const int n_prompt = static_cast<int>(state.prompt_tgt.size());
    if (n_prompt == 0) { return true; }

    const int n_batch = llama_n_batch(state.ctx_tgt);
    int off = 0;
    while (off < n_prompt) {
        const int take = std::min(n_batch, n_prompt - off);
        if (!mtp_process_chunk(state, off, take)) { return false; }
        off += n_batch;
    }
    return true;
}


// Create the sampler, the DRAFT_MTP speculative decoder and the decode
// batch for an MTP run.
//
// Returns "" on success or a short failure description. When the
// decoder cannot be created the sampler is freed and cleared before
// returning.
std::string mtp_init_decoder(
    SpeculativeRunState& state, llama_model* model_tgt,
    const GenerationParams& params, int n_max) {
    auto sampling_cfg = to_common_sampling(params);
    state.smpl = common_sampler_init(model_tgt, sampling_cfg);
    if (state.smpl == nullptr) { return "common_sampler_init failed"; }

    common_params_speculative cfg;
    cfg.types = {COMMON_SPECULATIVE_TYPE_DRAFT_MTP};
    cfg.draft.n_max = n_max;
    cfg.draft.ctx_tgt = state.ctx_tgt;
    cfg.draft.ctx_dft = state.ctx_dft;

    state.spec = common_speculative_init(cfg, 1);
    if (state.spec != nullptr) {
        state.batch_tgt =
            llama_batch_init(llama_n_batch(state.ctx_tgt), 0, 1);
        state.batch_initialized = true;
        return {};
    }

    common_sampler_free(state.smpl);
    state.smpl = nullptr;
    return "common_speculative_init (MTP) failed";
}


// Prepare an MTP run from a tokenized prompt.
//
// The last prompt token becomes id_last and the rest becomes
// prompt_tgt. The target memory is cleared, the sampler / decoder /
// batch are created, the prompt is prefilled, and finally the
// speculative decoder is started on prompt_tgt.
//
// `tokens` must be non-empty (the visible caller enforces >= 2).
// Returns "" on success or a failure description.
std::string mtp_init_run(
    SpeculativeRunState& state, llama_model* model_tgt,
    const std::vector<llama_token>& tokens,
    const GenerationParams& params, int n_max) {
    state.prompt_tgt.assign(tokens.begin(), tokens.end() - 1);
    state.id_last = tokens.back();
    state.n_past = static_cast<int>(state.prompt_tgt.size());

    llama_memory_clear(llama_get_memory(state.ctx_tgt), true);

    std::string failure = mtp_init_decoder(state, model_tgt, params, n_max);
    if (failure.empty() && !mtp_prefill_and_seed(state)) {
        failure = "MTP prefill/process failed";
    }
    if (failure.empty()) {
        common_speculative_begin(
            state.spec, state.seq_id, state.prompt_tgt);
    }
    return failure;
}


// Drive MTP rounds until the token budget is used up, the cancel flag
// is seen, or a round asks to stop.
//
// Cancellation observed at the top of an iteration records
// ENTROPIC_ERROR_CANCELLED / "cancelled". If nothing set a finish_reason
// by the time the loop ends, it becomes "length" when the budget was
// reached and "stop" otherwise.
void mtp_run_loop(
    SpeculativeRunState& state, int n_max, const llama_vocab* vocab,
    int max_tokens, std::function<void(std::string_view)>& on_token,
    std::atomic<bool>& cancel) {
    bool more = state.n_generated < max_tokens;
    while (more) {
        if (cancel.load(std::memory_order_acquire)) {
            state.error_code = ENTROPIC_ERROR_CANCELLED;
            state.finish_reason = "cancelled";
            break;
        }
        // A round that asks to stop ends the loop without re-reading
        // the budget.
        more = mtp_accept_round(state, n_max, vocab, max_tokens,
                                on_token, cancel)
            && state.n_generated < max_tokens;
    }

    if (state.finish_reason.empty()) {
        const bool hit_budget = state.n_generated >= max_tokens;
        state.finish_reason = hit_budget ? "length" : "stop";
    }
}


// Run the MTP speculative kernel over an already tokenized prompt:
// initialise the run, loop rounds, finalize.
//
// `stop` is copied into the run state so emitted text is checked against
// it (gh#108). An initialisation failure releases whatever was created
// and yields a GENERATE_FAILED error result carrying the failure
// description.
GenerationResult mtp_run_from_tokens(
    llama_context* ctx_tgt, llama_context* ctx_dft, llama_model* model_tgt,
    const std::vector<llama_token>& tokens, const GenerationParams& params,
    std::function<void(std::string_view)>& on_token,
    std::atomic<bool>& cancel, int n_max,
    const std::vector<std::string>& stop,
    std::chrono::steady_clock::time_point t0) {
    SpeculativeRunState state;
    state.ctx_tgt = ctx_tgt;
    state.ctx_dft = ctx_dft;
    state.stop = stop;

    std::string failure =
        mtp_init_run(state, model_tgt, tokens, params, n_max);
    if (failure.empty()) {
        const llama_vocab* vocab = llama_model_get_vocab(model_tgt);
        mtp_run_loop(state, n_max, vocab, params.max_tokens,
                     on_token, cancel);
        return spec_finalize(state, t0);
    }

    spec_cleanup(state);
    return spec_error(ENTROPIC_ERROR_GENERATE_FAILED, std::move(failure));
}


}  // anonymous namespace


// Gate for generate_mtp. A default-constructed result (ENTROPIC_OK)
// means "proceed"; anything else is the error to hand back.
//
// Checks, first failure wins:
//   1. backend is active                       → INVALID_STATE
//   2. mtp_unsupported_reason(...) is empty     → SPECULATIVE_INCOMPATIBLE_CONFIG
//   3. setup_mtp_draft(head_path, n_max) works  → LOAD_FAILED (last_error_)
//   4. 1 + mtp_n_max_ fits in n_batch           → SPECULATIVE_INCOMPATIBLE_CONFIG
GenerationResult LlamaCppBackend::mtp_guard(
    const GenerationParams& params,
    const std::function<void(std::string_view)>& on_token,
    const std::string& head_path, int n_max) {
    const std::string reason = mtp_unsupported_reason(
        params.temperature, !params.grammar.empty(),
        static_cast<bool>(on_token));

    if (!is_active()) {
        return spec_error(ENTROPIC_ERROR_INVALID_STATE,
                          "MTP requires an ACTIVE target");
    }
    if (!reason.empty()) {
        return spec_error(
            ENTROPIC_ERROR_SPECULATIVE_INCOMPATIBLE_CONFIG, reason);
    }
    if (!setup_mtp_draft(head_path, n_max)) {
        return spec_error(ENTROPIC_ERROR_LOAD_FAILED, last_error_);
    }

    // id_last plus the full draft must fit into a single batch.
    const auto n_batch = llama_n_batch(ctx_);
    const auto needed = 1 + mtp_n_max_;
    if (needed > n_batch) {
        return spec_error(ENTROPIC_ERROR_SPECULATIVE_INCOMPATIBLE_CONFIG,
            "speculative.n_draft+1 (" + std::to_string(needed)
            + ") exceeds n_batch (" + std::to_string(n_batch)
            + "); reduce n_draft or raise n_batch");
    }

    GenerationResult proceed;  // ENTROPIC_OK by default
    return proceed;
}


// Target-owned MTP speculative generation.
//
// Holds mtp_mutex_ for the whole call (serialises against teardown).
// After mtp_guard passes, the resident KV is invalidated, the prompt is
// rendered and tokenized, and the MTP kernel is run with the stop
// sequences from effective_stop(params).
//
// Error results: whatever mtp_guard reports, or GENERATE_FAILED when the
// prompt tokenizes to fewer than 2 tokens.
GenerationResult LlamaCppBackend::generate_mtp(
    const std::vector<Message>& messages,
    const GenerationParams& params,
    std::function<void(std::string_view)> on_token,
    std::atomic<bool>& cancel,
    const std::string& head_path,
    int n_max)
{
    const auto t0 = entropic::log::now();
    std::lock_guard<std::mutex> lk(mtp_mutex_);  // serialise vs teardown

    GenerationResult gate = mtp_guard(params, on_token, head_path, n_max);
    if (gate.error_code != ENTROPIC_OK) { return gate; }

    invalidate_resident_kv();  // MTP kernel owns seq 0 itself

    auto tokens = tokenize(render_prompt(messages, params), true);
    if (tokens.size() < 2) {
        return spec_error(ENTROPIC_ERROR_GENERATE_FAILED,
            "MTP prompt must have at least 2 tokens");
    }

    logger->info("MTP: {} input tokens, max_tokens={}, n_max={}",
                 tokens.size(), params.max_tokens, mtp_n_max_);

    // gh#108: stop sequences are honored on the MTP path.
    return mtp_run_from_tokens(
        ctx_, mtp_draft_ctx_, model_, tokens, params, on_token, cancel,
        mtp_n_max_, effective_stop(params), t0);
}


// Raw-prompt completion: tokenize `prompt` (second tokenize argument is
// false here, unlike the chat paths which pass true), run decode_loop
// with null for its last two arguments, and finalize the result against
// the start time.
GenerationResult LlamaCppBackend::do_complete(
    const std::string& prompt,
    const GenerationParams& params)
{
    const auto started_at = entropic::log::now();
    invalidate_resident_kv();  // gh#96: decode_loop/run_prefill mutate seq 0

    auto prompt_tokens = tokenize(prompt, false);
    logger->info("Complete: {} input tokens, max_tokens={}",
                 prompt_tokens.size(), params.max_tokens);
    log_sampler_config(params);

    auto completion = decode_loop(prompt_tokens, params, nullptr, nullptr);
    finalize_result(completion, started_at);
    return completion;
}


// ── Architecture detection (v1.9.13) ───────────────────────


/**
 * @brief Report whether the loaded model is flagged as recurrent.
 * @return The cached is_recurrent_ flag; this accessor performs no
 *         detection of its own.
 */
bool LlamaCppBackend::is_recurrent() const {

    return is_recurrent_;

}


// ── Capability overrides (v1.9.13) ─────────────────────────


/**
 * @brief Report whether this backend supports a capability.
 *
 * A static table answers capabilities that are always supported. A
 * `false` table entry is then re-evaluated against runtime state:
 *   - KV_CACHE             — model is not recurrent
 *   - HIDDEN_STATE         — model is recurrent
 *   - VISION               — an mmproj path is configured
 *   - AUDIO                — an mtmd context exists and reports audio support
 *   - SPECULATIVE_DECODING — model is not recurrent
 *
 * @param cap Capability to query.
 * @return true if supported; false for unsupported capabilities and for
 *         values outside [0, BackendCapability::_COUNT).
 */
bool LlamaCppBackend::do_supports(BackendCapability cap) const {
    int idx = static_cast<int>(cap);
    int count = static_cast<int>(BackendCapability::_COUNT);
    if (idx < 0 || idx >= count) {
        return false;
    }

    // Static capabilities: true = always supported. Length must equal
    // BackendCapability::_COUNT — trailing entries get appended as new
    // capabilities are introduced (gh#53 added AUDIO at index 12).
    static constexpr bool always[] = {
        false, false, true, true, true, true,
        false, true,  true, false, false, true,
        false,  // AUDIO — dynamic only (mtmd_support_audio)
    };

    // Enforce the length invariant at compile time. The range check above
    // is against _COUNT, not the table size, so a capability added without
    // extending the table would otherwise index past the end of always[].
    static_assert(
        sizeof(always) / sizeof(always[0])
            == static_cast<std::size_t>(BackendCapability::_COUNT),
        "do_supports: static capability table must have exactly "
        "BackendCapability::_COUNT entries");

    // Dynamic capabilities override the static table
    bool result = always[idx];
    if (!result) {
        result = (cap == BackendCapability::KV_CACHE && !is_recurrent())
              || (cap == BackendCapability::HIDDEN_STATE && is_recurrent())
              || (cap == BackendCapability::VISION
                  && !config().mmproj_path.empty())
              || (cap == BackendCapability::AUDIO
                  && mtmd_ctx_ != nullptr
                  && mtmd_support_audio(mtmd_ctx_))
              || (cap == BackendCapability::SPECULATIVE_DECODING
                  && !is_recurrent());
    }
    return result;
}


/**
 * @brief Name of this backend.
 * @return The fixed string "llama.cpp".
 */
std::string LlamaCppBackend::do_backend_name() const {
    return std::string("llama.cpp");
}


/**
 * @brief Build the backend metadata record.
 *
 * name, model_format and compute_device are always filled in;
 * compute_device is chosen at compile time from the ENTROPIC_BACKEND_*
 * macros. Model-derived fields are populated only when the backend is
 * not COLD and a model handle exists.
 *
 * @return Populated BackendInfo. max_context_length is -1 for recurrent
 *         models; vram_bytes is always written as 0 here.
 */
BackendInfo LlamaCppBackend::do_info() const {
    BackendInfo info;
    info.name = "llama.cpp";
    info.model_format = "gguf";
#if defined(ENTROPIC_BACKEND_CUDA)
    info.compute_device = "cuda";
#elif defined(ENTROPIC_BACKEND_VULKAN)
    info.compute_device = "vulkan";
#else
    info.compute_device = "cpu";
#endif

    // Without a loaded model there is nothing more to report.
    if (state() == ModelState::COLD || model_ == nullptr) {
        return info;
    }

    const bool recurrent = is_recurrent();
    if (recurrent) {
        info.architecture = "recurrent";
        info.max_context_length = -1;
    } else {
        info.architecture = "transformer";
        info.max_context_length = config().context_length;
    }

    info.parameter_count = llama_model_n_params(model_);
    info.vram_bytes = 0;
    info.ram_bytes = llama_model_size(model_);

    // Zero-initialised, so the buffer is a valid C string even if
    // llama_model_desc writes nothing.
    char description[256] = {};
    llama_model_desc(model_, description, sizeof(description));
    info.quantization = description;

    return info;
}


/**
 * @brief Clear llama.cpp memory for one sequence or for all of them.
 *
 * @param seq_id Negative clears the whole memory; otherwise the given
 *               sequence is removed with position bounds (-1, -1).
 * @return false when there is no inference context, true otherwise.
 *
 * NOTE(review): this does not update the warm-keep record — confirm
 * callers that clear seq 0 also invalidate it.
 */
bool LlamaCppBackend::do_clear_state(int seq_id) {
    if (!ctx_) {
        return false;
    }

    auto memory = llama_get_memory(ctx_);
    const bool clear_everything = seq_id < 0;
    if (clear_everything) {
        llama_memory_clear(memory, true);
        return true;
    }

    // Return value deliberately ignored, matching the contract above:
    // success is reported whenever a context exists.
    llama_memory_seq_rm(memory, seq_id, -1, -1);
    return true;
}


/**
 * @brief Serialize one sequence's state into a byte buffer.
 *
 * @param seq_id Sequence to capture.
 * @param buffer Output; resized to the size llama.cpp reports for the
 *               sequence. Left untouched when the context is missing or
 *               the reported size is 0.
 * @return true only when the number of bytes written equals the
 *         reported size.
 */
bool LlamaCppBackend::do_save_state(
    int seq_id, std::vector<uint8_t>& buffer) const {
    if (!ctx_) {
        return false;
    }

    const auto seq = static_cast<llama_seq_id>(seq_id);
    const size_t needed = llama_state_seq_get_size(ctx_, seq);
    if (needed == 0) {
        return false;
    }

    buffer.resize(needed);
    const size_t copied =
        llama_state_seq_get_data(ctx_, buffer.data(), needed, seq);
    return copied == needed;
}


/**
 * @brief Load a previously captured sequence state from a byte buffer.
 *
 * @param seq_id Destination sequence.
 * @param buffer Bytes to restore from.
 * @return false when there is no inference context, the buffer is
 *         empty, or llama.cpp reports 0 bytes; true otherwise.
 *
 * NOTE(review): this does not update the warm-keep record — confirm
 * callers restoring into seq 0 invalidate it.
 */
bool LlamaCppBackend::do_restore_state(
    int seq_id, const std::vector<uint8_t>& buffer) {
    if (!ctx_) {
        return false;
    }
    if (buffer.empty()) {
        return false;
    }

    const auto seq = static_cast<llama_seq_id>(seq_id);
    const size_t consumed =
        llama_state_seq_set_data(ctx_, buffer.data(), buffer.size(), seq);
    return consumed != 0;
}


} // namespace entropic

adapter_base.h
ChatAdapter concrete base class.

batch_util.h
gh#98 (v2.8.0) same-prefix batch-generation decision logic.

entropic::InferenceBackend::do_generate_batch
virtual std::vector< GenerationResult > do_generate_batch(const std::vector< std::vector< Message > > &requests, const std::vector< GenerationParams > &params, std::atomic< bool > &cancel)
Subclass same-prefix batch generation (gh#98, v2.8.0).
Definition backend.h:535

entropic::InferenceBackend::last_error_
std::string last_error_
Last error message for diagnostics.
Definition backend.h:726

entropic::InferenceBackend::is_active
bool is_active() const
True when state is ACTIVE.
Definition backend.h:249

entropic::InferenceBackend::state
ModelState state() const
Current lifecycle state (lock-free read).
Definition backend.h:241

entropic::InferenceBackend::config
const ModelConfig & config() const
Stored model config.
Definition backend.h:320

entropic::InferenceBackend::context_length
int context_length() const
Model's context window size.
Definition backend.h:282

entropic::InferenceBackend::state_
std::atomic< ModelState > state_
State transition slot accessible to subclasses for test-only injection.
Definition backend.h:752

entropic::LlamaCppBackend
LlamaCppBackend — common llama.cpp patterns (15% layer).
Definition llama_cpp_backend.h:65

entropic::LlamaCppBackend::parse_params_valid_
bool parse_params_valid_
True once a tooled render snapshotted.
Definition llama_cpp_backend.h:695

entropic::LlamaCppBackend::last_gen_decode_calls_
int last_gen_decode_calls_
gh#98: batched-decode step count of last batch
Definition llama_cpp_backend.h:637

entropic::LlamaCppBackend::load_gpu_model
bool load_gpu_model()
Load the GGUF model onto the GPU (do_activate step 1).
Definition llama_cpp_backend.cpp:408

entropic::LlamaCppBackend::do_load
bool do_load(const ModelConfig &config) override
Load model into CPU RAM (COLD → WARM).
Definition llama_cpp_backend.cpp:273

entropic::LlamaCppBackend::do_supports
bool do_supports(BackendCapability cap) const override
Declare llama.cpp backend capabilities.
Definition llama_cpp_backend.cpp:3978

entropic::LlamaCppBackend::build_batch_results
std::vector< GenerationResult > build_batch_results(std::vector< BatchSeq > &seqs)
Detokenize each sequence into a GenerationResult.
Definition llama_cpp_backend.cpp:1813

entropic::LlamaCppBackend::effective_stop
std::vector< std::string > effective_stop(const GenerationParams &params) const
params.stop + the sequential tool-call close marker, if applicable.
Definition llama_cpp_backend.cpp:1286

entropic::LlamaCppBackend::last_prefill_ms_
double last_prefill_ms_
gh#96: prefill wall-clock ms of last generate()
Definition llama_cpp_backend.h:639

entropic::LlamaCppBackend::last_input_tokens_
int last_input_tokens_
gh#97: tokenized prompt size of last generate()
Definition llama_cpp_backend.h:638

entropic::LlamaCppBackend::decode_loop
GenerationResult decode_loop(const std::vector< llama_token > &tokens, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > *cancel)
Core decode loop — shared by generate and streaming.
Definition llama_cpp_backend.cpp:1562

entropic::LlamaCppBackend::is_recurrent_
bool is_recurrent_
True if loaded model is recurrent (GDN/Mamba/RWKV).
Definition llama_cpp_backend.h:1068

entropic::LlamaCppBackend::try_warm_reuse
bool try_warm_reuse(const std::vector< llama_token > &tokens)
gh#96 (v2.7.5): try incremental prefill against resident KV.
Definition llama_cpp_backend.cpp:2200

entropic::LlamaCppBackend::do_evaluate_logprobs
LogprobResult do_evaluate_logprobs(const int32_t *tokens, int n_tokens) override
Evaluate per-token log-probabilities via sequential decode.
Definition llama_cpp_backend.cpp:859

entropic::LlamaCppBackend::do_backend_name
std::string do_backend_name() const override
Return backend name.
Definition llama_cpp_backend.cpp:4016

entropic::LlamaCppBackend::is_hybrid_
bool is_hybrid_
gh#97: attention + recurrent/SSM memory
Definition llama_cpp_backend.h:1069

entropic::LlamaCppBackend::do_save_state
bool do_save_state(int seq_id, std::vector< uint8_t > &buffer) const override
Capture a sequence's KV cache into a byte buffer.
Definition llama_cpp_backend.cpp:4089

entropic::LlamaCppBackend::render_prompt
std::string render_prompt(const std::vector< Message > &messages, const GenerationParams &params)
Generation render seam: common_chat-with-tools or legacy (gh#87).
Definition llama_cpp_backend.cpp:1175

entropic::LlamaCppBackend::prompt_cache_
std::unique_ptr< PromptCache > prompt_cache_
KV prefix cache (v1.8.3)
Definition llama_cpp_backend.h:674

entropic::LlamaCppBackend::teardown_mtp_draft
void teardown_mtp_draft()
Free the MTP head context + model (gh#106 lifecycle).
Definition llama_cpp_backend.cpp:529

entropic::LlamaCppBackend::parse_generation_prompt_
std::string parse_generation_prompt_
Last TOOLED render's gen prompt.
Definition llama_cpp_backend.h:693

entropic::LlamaCppBackend::run_batched_decode
std::vector< GenerationResult > run_batched_decode(const std::vector< std::vector< llama_token > > &toks, const std::vector< GenerationParams > &params, std::size_t shared, std::atomic< bool > &cancel)
Run the gh#98 multi-seq batched decode (v2.8.0).
Definition llama_cpp_backend.cpp:1849

entropic::LlamaCppBackend::do_generate
GenerationResult do_generate(const std::vector< Message > &messages, const GenerationParams &params) override
Generate a complete response using chat template.
Definition llama_cpp_backend.cpp:2535

entropic::LlamaCppBackend::reload_model_cpu_only
void reload_model_cpu_only()
Reload the model CPU-only for the WARM state (do_deactivate tail).
Definition llama_cpp_backend.cpp:663

entropic::LlamaCppBackend::render_with_tools
std::string render_with_tools(const std::vector< Message > &messages, const GenerationParams &params)
Render messages through common_chat WITH the active tools.
Definition llama_cpp_backend.cpp:1210

entropic::LlamaCppBackend::sample_batch_active
void sample_batch_active(std::vector< BatchSeq > &seqs)
Sample+accept+classify each still-active sequence.
Definition llama_cpp_backend.cpp:1757

entropic::LlamaCppBackend::common_chat_parse_reliable
bool common_chat_parse_reliable() const
True iff common_chat parsing is reliable for the last render (gh#87).
Definition llama_cpp_backend.cpp:1251

entropic::LlamaCppBackend::active_tools_json_
std::string active_tools_json_
MCP tool defs for next render.
Definition llama_cpp_backend.h:678

entropic::LlamaCppBackend::do_complete
GenerationResult do_complete(const std::string &prompt, const GenerationParams &params) override
Raw text completion without chat template.
Definition llama_cpp_backend.cpp:3941

entropic::LlamaCppBackend::last_prefill_tokens_
int last_prefill_tokens_
gh#96: prompt tokens decoded by last generate()
Definition llama_cpp_backend.h:636

entropic::LlamaCppBackend::tokenize
std::vector< llama_token > tokenize(const std::string &text, bool add_special) const
Tokenize text using model vocabulary.
Definition llama_cpp_backend.cpp:788

entropic::LlamaCppBackend::create_inference_context
bool create_inference_context()
Create the llama context + prompt cache (do_activate step 2).
Definition llama_cpp_backend.cpp:453

entropic::LlamaCppBackend::vocab_
const llama_vocab * vocab_
Vocabulary (from model_)
Definition llama_cpp_backend.h:635

entropic::LlamaCppBackend::tool_call_close_marker
std::string tool_call_close_marker() const override
Tool-call close marker for the captured chat format (gh#103).
Definition llama_cpp_backend.cpp:1267

entropic::LlamaCppBackend::have_chat_params_
bool have_chat_params_
True once a tool render captured params.
Definition llama_cpp_backend.h:685

entropic::LlamaCppBackend::compute_prefix_token_count
int compute_prefix_token_count(const std::vector< Message > &messages, const GenerationParams &params)
Compute token count of system messages only.
Definition llama_cpp_backend.cpp:2051

entropic::LlamaCppBackend::sampler_factory_
std::unique_ptr< SamplerFactory > sampler_factory_
Factory used by the decode loop to build per-generation samplers.
Definition llama_cpp_backend.h:669

entropic::LlamaCppBackend::release_temp_seqs
void release_temp_seqs(std::vector< BatchSeq > &seqs)
Release every batch sequence's temp seq_id (seq 0 excluded).
Definition llama_cpp_backend.cpp:1832

entropic::LlamaCppBackend::detokenize
std::string detokenize(llama_token token) const
Detokenize a single token.
Definition llama_cpp_backend.cpp:809

entropic::LlamaCppBackend::set_active_tools
void set_active_tools(const std::string &tools_json)
Stage tool definitions for the next common_chat render (gh#87).
Definition llama_cpp_backend.cpp:1192

entropic::LlamaCppBackend::init_mmproj_if_configured
void init_mmproj_if_configured()
Initialize the libmtmd context if mmproj is configured.
Definition llama_cpp_backend.cpp:490

entropic::LlamaCppBackend::last_chat_format_
int last_chat_format_
Captured common_chat_format.
Definition llama_cpp_backend.h:682

entropic::LlamaCppBackend::generate_speculative_with_draft
GenerationResult generate_speculative_with_draft(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel, LlamaCppBackend &draft, int n_draft_max, const std::string &draft_path)
Speculative-decoding kernel with explicit draft backend.
Definition llama_cpp_backend.cpp:3571

entropic::LlamaCppBackend::ctx_
llama_context * ctx_
Inference context (ACTIVE)
Definition llama_cpp_backend.h:634

entropic::LlamaCppBackend::run_prefill
bool run_prefill(const std::vector< llama_token > &tokens)
Run batched prefill on input tokens.
Definition llama_cpp_backend.cpp:1484

entropic::LlamaCppBackend::run_sampling_loop
GenerationResult run_sampling_loop(const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel, const std::chrono::steady_clock::time_point &t0)
Sample tokens until stop / max_tokens / cancel.
Definition llama_cpp_backend.cpp:2440

entropic::LlamaCppBackend::next_temp_seq_id_
llama_seq_id next_temp_seq_id_
gh#98: monotonic high-water for NEW temp seq_ids (the old 1 + size() handed out duplicates when the p...
Definition llama_cpp_backend.h:1059

entropic::LlamaCppBackend::last_generation_prompt_
std::string last_generation_prompt_
Captured generation_prompt.
Definition llama_cpp_backend.h:683

entropic::LlamaCppBackend::mtp_guard
GenerationResult mtp_guard(const GenerationParams &params, const std::function< void(std::string_view)> &on_token, const std::string &head_path, int n_max)
Validate MTP run preconditions (gh#108, fail-fast/fail-loud).
Definition llama_cpp_backend.cpp:3872

entropic::LlamaCppBackend::restore_cached_prefix
bool restore_cached_prefix(const CacheEntry *cached, const std::vector< llama_token > &tokens)
Restore KV state from cache and decode remaining tokens.
Definition llama_cpp_backend.cpp:1997

entropic::LlamaCppBackend::save_prefix_to_cache
void save_prefix_to_cache(const CacheKey &key, int prefix_tokens)
Capture seq 0 KV state and store under the given key.
Definition llama_cpp_backend.cpp:2026

entropic::LlamaCppBackend::tokenize_text
std::vector< int32_t > tokenize_text(const std::string &text) const override
Tokenize text to token IDs using model vocabulary.
Definition llama_cpp_backend.cpp:837

entropic::LlamaCppBackend::mtp_n_max_
int mtp_n_max_
MTP draft window (n_max) of the live head.
Definition llama_cpp_backend.h:646

entropic::LlamaCppBackend::is_recurrent
bool is_recurrent() const
Check if loaded model is recurrent.
Definition llama_cpp_backend.cpp:3965

entropic::LlamaCppBackend::step_token
std::string step_token(Sampler &sampler, std::string &generated, std::function< void(std::string_view)> &on_token, const std::vector< std::string > &stop)
Generate one token and append to output.
Definition llama_cpp_backend.cpp:1521

entropic::LlamaCppBackend::generate_after_prefill
GenerationResult generate_after_prefill(Sampler &sampler, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > *cancel)
The post-prefill sampling loop (extracted from decode_loop).
Definition llama_cpp_backend.cpp:1604

entropic::LlamaCppBackend::mtmd_prefill
entropic_error_t mtmd_prefill(const std::string &prompt, const std::vector<::mtmd_bitmap * > &bitmaps, std::string &err_msg)
Run mtmd_tokenize + mtmd_helper_eval_chunks on a prompt.
Definition llama_cpp_backend.cpp:2397

entropic::LlamaCppBackend::run_batch_gen_loop
void run_batch_gen_loop(std::vector< BatchSeq > &seqs, int max_steps, std::atomic< bool > &cancel)
Decode all sequences together until each finishes.
Definition llama_cpp_backend.cpp:1785

entropic::LlamaCppBackend::run_prefill_cached
bool run_prefill_cached(const std::vector< llama_token > &tokens, const std::string &system_prompt, const std::vector< Message > &messages, const GenerationParams &params)
Run prefill with prompt cache integration.
Definition llama_cpp_backend.cpp:2134

entropic::LlamaCppBackend::mtp_head_path_
std::string mtp_head_path_
Path the live mtp_draft_ctx_ was built from.
Definition llama_cpp_backend.h:645

entropic::LlamaCppBackend::do_generate_text_only
GenerationResult do_generate_text_only(const std::vector< Message > &messages, const GenerationParams &params)
Text-only batch generation (extracted from do_generate).
Definition llama_cpp_backend.cpp:2555

entropic::LlamaCppBackend::do_restore_state
bool do_restore_state(int seq_id, const std::vector< uint8_t > &buffer) override
Restore a sequence's KV cache from a byte buffer.
Definition llama_cpp_backend.cpp:4116

entropic::LlamaCppBackend::apply_chat_template
std::string apply_chat_template(const std::vector< Message > &messages, const GenerationParams &params) const
Apply chat template to messages.
Definition llama_cpp_backend.cpp:1154

entropic::LlamaCppBackend::parse_response
CommonChatResult parse_response(const std::string &raw) const
Parse a raw model emission via the last captured render params.
Definition llama_cpp_backend.cpp:1373

entropic::LlamaCppBackend::prefill_batch_suffixes
bool prefill_batch_suffixes(std::vector< BatchSeq > &seqs, const std::vector< std::vector< llama_token > > &toks, std::size_t shared)
Prefill each request's suffix; set per-seq logits_idx.
Definition llama_cpp_backend.cpp:1721

entropic::LlamaCppBackend::tokenizer_
std::unique_ptr< Tokenizer > tokenizer_
Tokenizer used by tokenize_text / do_count_tokens / internal tokenize/detokenize.
Definition llama_cpp_backend.h:658

entropic::LlamaCppBackend::do_generate_streaming
GenerationResult do_generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
Streaming generation with per-token callback.
Definition llama_cpp_backend.cpp:2696

entropic::LlamaCppBackend::has_vision_
bool has_vision_
Cached mtmd_support_vision(mtmd_ctx_) result.
Definition llama_cpp_backend.h:1091

entropic::LlamaCppBackend::inject_tokenizer_for_test
void inject_tokenizer_for_test(std::unique_ptr< Tokenizer > tokenizer)
Inject a tokenizer for unit testing (v2.3.10).
Definition llama_cpp_backend.cpp:712

entropic::LlamaCppBackend::decode_tokens_from
bool decode_tokens_from(const std::vector< llama_token > &tokens, int start_offset)
Decode tokens starting at a given offset.
Definition llama_cpp_backend.cpp:1957

entropic::LlamaCppBackend::prefill_dispatch
bool prefill_dispatch(const std::vector< llama_token > &tokens, const std::string &system_prompt, const std::vector< Message > &messages, const GenerationParams &params)
Cache-aware prefill dispatch (gh#96 v2.7.5: extracted body of run_prefill_cached so the wrapper owns ...
Definition llama_cpp_backend.cpp:2258

entropic::LlamaCppBackend::release_temp_seq_id
void release_temp_seq_id(llama_seq_id seq_id)
Release a temporary sequence ID back to the pool.
Definition llama_cpp_backend.cpp:930

entropic::LlamaCppBackend::create_sampler
std::unique_ptr< Sampler > create_sampler(const GenerationParams &params) const
Build a Sampler for one generation from params.
Definition llama_cpp_backend.cpp:1468

entropic::LlamaCppBackend::do_count_tokens
int do_count_tokens(const std::string &text) const override
Count tokens in text.
Definition llama_cpp_backend.cpp:825

entropic::LlamaCppBackend::mtmd_ctx_
::mtmd_context * mtmd_ctx_
libmtmd context, or nullptr if no mmproj loaded.
Definition llama_cpp_backend.h:1087

entropic::LlamaCppBackend::last_parser_
std::string last_parser_
Captured serialized PEG arena.
Definition llama_cpp_backend.h:684

entropic::LlamaCppBackend::generate_multimodal
GenerationResult generate_multimodal(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel)
Multimodal generation core (v1.9.11 Phases 5–7).
Definition llama_cpp_backend.cpp:2484

entropic::LlamaCppBackend::seq_id_mutex_
std::mutex seq_id_mutex_
Guards temp seq_id pool (v1.9.10)
Definition llama_cpp_backend.h:1053

entropic::LlamaCppBackend::do_clear_state
bool do_clear_state(int seq_id) override
Clear KV cache or recurrent hidden state.
Definition llama_cpp_backend.cpp:4060

entropic::LlamaCppBackend::prefill_shared_and_fanout
bool prefill_shared_and_fanout(std::vector< BatchSeq > &seqs, const std::vector< llama_token > &seq0, std::size_t shared)
Prefill shared prefix into seq 0 + seq_cp fan-out.
Definition llama_cpp_backend.cpp:1698

entropic::LlamaCppBackend::apply_chat_template_lowlevel
std::string apply_chat_template_lowlevel(const std::vector< Message > &messages) const
Low-level GGUF template path (gh#86 fallback, v2.6.1).
Definition llama_cpp_backend.cpp:1420

entropic::LlamaCppBackend::extract_token_logprob
static float extract_token_logprob(const float *logits, int32_t next_token, int n_vocab)
Extract log-probability for a token from logits.
Definition llama_cpp_backend.cpp:948

entropic::LlamaCppBackend::do_deactivate
void do_deactivate() override
Deactivate: free context, reload model CPU-only.
Definition llama_cpp_backend.cpp:618

entropic::LlamaCppBackend::generate_mtp
GenerationResult generate_mtp(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel, const std::string &head_path, int n_max)
Speculative generation via a target-owned MTP head (gh#106).
Definition llama_cpp_backend.cpp:3906

entropic::LlamaCppBackend::do_info
BackendInfo do_info() const override
Populate backend metadata from llama.cpp model.
Definition llama_cpp_backend.cpp:4026

entropic::LlamaCppBackend::do_generate_batch
std::vector< GenerationResult > do_generate_batch(const std::vector< std::vector< Message > > &requests, const std::vector< GenerationParams > &params, std::atomic< bool > &cancel) override
Same-prefix batch generation (gh#98, v2.8.0).
Definition llama_cpp_backend.cpp:1903

entropic::LlamaCppBackend::parse_parser_
std::string parse_parser_
Last TOOLED render's PEG arena.
Definition llama_cpp_backend.h:694

entropic::LlamaCppBackend::do_activate
bool do_activate() override
Activate model on GPU (WARM → ACTIVE).
Definition llama_cpp_backend.cpp:378

entropic::LlamaCppBackend::prepare_batch_seqs
bool prepare_batch_seqs(std::vector< BatchSeq > &seqs, const std::vector< GenerationParams > &params)
Build per-request sampler chains + seq ids.
Definition llama_cpp_backend.cpp:1679

entropic::LlamaCppBackend::build_mtp_head
bool build_mtp_head(const std::string &head_path)
Load the MTP head GGUF + create its shared-KV context (gh#106).
Definition llama_cpp_backend.cpp:568

entropic::LlamaCppBackend::prefill_and_cache_prefix
bool prefill_and_cache_prefix(const std::vector< llama_token > &tokens, int prefix_tokens, const CacheKey &key)
Two-pass prefill: prefix-only prefill → save → rest.
Definition llama_cpp_backend.cpp:2091

entropic::LlamaCppBackend::mtp_mutex_
std::mutex mtp_mutex_
gh#108: serialises MTP head setup/teardown vs in-flight generate_mtp (no deactivate-during-generate U...
Definition llama_cpp_backend.h:647

entropic::LlamaCppBackend::allocate_temp_seq_id
llama_seq_id allocate_temp_seq_id()
Allocate a temporary sequence ID for evaluation.
Definition llama_cpp_backend.cpp:914

entropic::LlamaCppBackend::prompt_cache_config_
PromptCacheConfig prompt_cache_config_
Cache config (v1.8.3)
Definition llama_cpp_backend.h:673

entropic::LlamaCppBackend::parse_chat_format_
int parse_chat_format_
Last TOOLED render's format.
Definition llama_cpp_backend.h:692

entropic::LlamaCppBackend::resident_tokens_
std::vector< llama_token > resident_tokens_
gh#96: tokens resident in KV seq 0 (warm-keep)
Definition llama_cpp_backend.h:640

entropic::LlamaCppBackend::mtp_draft_model_
llama_model * mtp_draft_model_
MTP head GGUF (separate, trunk-sharing)
Definition llama_cpp_backend.h:643

entropic::LlamaCppBackend::do_unload
void do_unload() override
Full unload — free all resources, clear prompt cache.
Definition llama_cpp_backend.cpp:744

entropic::LlamaCppBackend::model_
llama_model * model_
Loaded model (WARM+)
Definition llama_cpp_backend.h:633

entropic::LlamaCppBackend::~LlamaCppBackend
~LlamaCppBackend() override
Free llama.cpp + mtmd resources on destruction.
Definition llama_cpp_backend.cpp:695

entropic::LlamaCppBackend::invalidate_resident_kv
void invalidate_resident_kv()
gh#96 (v2.7.5): drop the warm-keep resident-KV record.
Definition llama_cpp_backend.cpp:2238

entropic::LlamaCppBackend::free_seq_ids_
std::vector< llama_seq_id > free_seq_ids_
Available temporary seq_ids (v1.9.10)
Definition llama_cpp_backend.h:1054

entropic::LlamaCppBackend::do_generate_speculative
GenerationResult do_generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
Speculative streaming via the abstract InferenceBackend interface (kept as NOT_SUPPORTED — see kernel...
Definition llama_cpp_backend.cpp:2778

entropic::LlamaCppBackend::do_generate_streaming_text_only
GenerationResult do_generate_streaming_text_only(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Text-only streaming generation (extracted from streaming).
Definition llama_cpp_backend.cpp:2720

entropic::LlamaCppBackend::extract_system_prompt
static std::string extract_system_prompt(const std::vector< Message > &messages)
Extract the system prompt from messages.
Definition llama_cpp_backend.cpp:1934

entropic::LlamaCppBackend::mtp_draft_ctx_
llama_context * mtp_draft_ctx_
MTP context (ctx_type=MTP, ctx_other=ctx_)
Definition llama_cpp_backend.h:644

entropic::LlamaCppBackend::inject_sampler_factory_for_test
void inject_sampler_factory_for_test(std::unique_ptr< SamplerFactory > factory)
Inject a SamplerFactory for unit testing (v2.3.10).
Definition llama_cpp_backend.cpp:733

entropic::LlamaCppBackend::setup_mtp_draft
bool setup_mtp_draft(const std::string &head_path, int n_max)
Lazily build the MTP head context against the live ctx_ (gh#106).
Definition llama_cpp_backend.cpp:554

entropic::LlamaCppSampler
Sampler adapter that wraps a llama_sampler* chain.
Definition llama_cpp_sampler.h:47

entropic::LlamaCppSampler::native_chain
llama_sampler * native_chain() const
Expose the underlying chain for legacy call sites that have not yet been ported to the Sampler API.
Definition llama_cpp_sampler.h:88

entropic::PromptCache::make_key
static CacheKey make_key(std::string_view prompt_text, std::string_view model_path)
Compute a cache key from prompt text and model path.
Definition prompt_cache.cpp:67

entropic::Sampler
Pure-virtual per-generation sampler used by the decode loop.
Definition sampler.h:48

entropic::Sampler::sample
virtual int32_t sample()=0
Sample one token from the current decode position.

entropic_error_t
entropic_error_t
Error codes returned by all C API functions.
Definition error.h:35

ENTROPIC_OK
@ ENTROPIC_OK
Success.
Definition error.h:36

ENTROPIC_ERROR_CANCELLED
@ ENTROPIC_ERROR_CANCELLED
Operation cancelled via cancel token.
Definition error.h:48

ENTROPIC_ERROR_IMAGE_LOAD_FAILED
@ ENTROPIC_ERROR_IMAGE_LOAD_FAILED
Image file could not be read or decoded (v1.9.11)
Definition error.h:80

ENTROPIC_ERROR_SPECULATIVE_INCOMPATIBLE_CONFIG
@ ENTROPIC_ERROR_SPECULATIVE_INCOMPATIBLE_CONFIG
MTP/speculative enabled but the request can't run correctly (temp>0, grammar, tools,...
Definition error.h:90

ENTROPIC_ERROR_NOT_SUPPORTED
@ ENTROPIC_ERROR_NOT_SUPPORTED
Capability not supported by this backend (v1.9.13)
Definition error.h:84

ENTROPIC_ERROR_GENERATE_FAILED
@ ENTROPIC_ERROR_GENERATE_FAILED
Generation failed (context overflow, model error)
Definition error.h:42

ENTROPIC_ERROR_INVALID_STATE
@ ENTROPIC_ERROR_INVALID_STATE
Operation not valid in current state (e.g., generate before activate)
Definition error.h:39

ENTROPIC_ERROR_LOAD_FAILED
@ ENTROPIC_ERROR_LOAD_FAILED
Model load failed (corrupt file, OOM, unsupported format)
Definition error.h:41

llama_cpp_backend.h
LlamaCppBackend — llama.cpp C API integration.

llama_cpp_sampler.h
Concrete llama.cpp Sampler + SamplerFactory (v2.3.10 seam impl).

llama_cpp_tokenizer.h
Concrete llama.cpp tokenizer (v2.3.10 seam impl).

logging.h
spdlog initialization and logger access.

entropic::log::now
auto now()
Get current time for timing measurements.
Definition logging.h:193

entropic::log::get
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
Definition logging.cpp:211

entropic::log::elapsed_ms
double elapsed_ms(std::chrono::steady_clock::time_point start, std::chrono::steady_clock::time_point end)
Compute elapsed milliseconds between two time points.
Definition logging.h:203

mtp_envelope.h
Pure envelope check for the MTP speculative path (gh#108).

entropic
Activate model on GPU (WARM → ACTIVE).
Definition bundled_models.h:20

entropic::ContentPartType::IMAGE
@ IMAGE
Image content (local path or data URI)

entropic::BackendCapability
BackendCapability
Capabilities that an inference backend may or may not support.
Definition backend_capability.h:33

entropic::BackendCapability::SPECULATIVE_DECODING
@ SPECULATIVE_DECODING
Speculative decoding compatibility.

entropic::BackendCapability::HIDDEN_STATE
@ HIDDEN_STATE
Recurrent hidden state management (save/load/reset)

entropic::BackendCapability::VISION
@ VISION
Vision / multimodal input (v1.9.11)

entropic::BackendCapability::KV_CACHE
@ KV_CACHE
KV cache state management (save/load/clear)

entropic::BackendCapability::AUDIO
@ AUDIO
Audio input via mtmd audio projector (gh#53, v2.3.0)

entropic::BackendCapability::_COUNT
@ _COUNT
Sentinel — must be last. Used for iteration/array sizing.

entropic::append_sequential_stop
void append_sequential_stop(GenerationParams &params, const std::string &marker)
Append a tool-call close marker to params.stop for sequential mode.
Definition tool_call_markers.h:90

entropic::coerce_string_typed_args
void coerce_string_typed_args(std::vector< ToolCall > &calls, const std::string &tools_json)
gh#90: coerce numeric scalars back to strings for string-typed tool parameters.
Definition adapter_base.cpp:468

entropic::batch_is_viable
bool batch_is_viable(std::size_t n, int n_parallel, std::size_t shared, bool hybrid, std::size_t total_suffix, int n_batch)
Decide whether the same-prefix batch fast-path is safe + worthwhile.
Definition batch_util.h:80

entropic::has_images
bool has_images(const std::vector< ContentPart > &parts)
Check if content parts contain any image parts.
Definition content.cpp:41

entropic::spec_decode_both
static bool spec_decode_both(SpeculativeRunState &state)
Decode the speculative batch on both contexts.
Definition llama_cpp_backend.cpp:2983

entropic::BudgetMode::tokens
@ tokens
Gate on generated tokens since the last tool call.

entropic::BudgetMode::off
@ off
Disabled (default) — no thinking-budget gating.

entropic::extract_text
std::string extract_text(const std::vector< ContentPart > &parts)
Extract concatenated text from content parts.
Definition content.cpp:20

entropic::close_marker_for_format
std::string close_marker_for_format(common_chat_format fmt)
Map a resolved common_chat format to its single-tool-call close marker.
Definition tool_call_markers.h:49

entropic::spec_ckpt_save_tgt
static void spec_ckpt_save_tgt(SpeculativeRunState &state)
Snapshot target state right before the target decode of the speculative batch (when use_ckpt_tgt + no...
Definition llama_cpp_backend.cpp:3109

entropic::ToolResultKind::ok
@ ok
Tool dispatched, returned non-empty content.

entropic::batch_shared_prefix_len
std::size_t batch_shared_prefix_len(const std::vector< std::vector< Tok > > &seqs)
Longest shared token prefix across N request sequences (gh#98).
Definition batch_util.h:43

entropic::spec_trim_rejected_drafts
static void spec_trim_rejected_drafts(SpeculativeRunState &state)
Clear any stale KV positions left by rejected draft tokens.
Definition llama_cpp_backend.cpp:3185

entropic::spec_commit_accepted
static bool spec_commit_accepted(SpeculativeRunState &state, const std::vector< llama_token > &ids, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
Walk accepted ids, emit tokens via callback, update state.
Definition llama_cpp_backend.cpp:3198

entropic::mcp_tools_to_common_chat
static std::vector< common_chat_tool > mcp_tools_to_common_chat(const std::string &tools_json)
Convert entropic MCP tool JSON to common_chat_tool defs (gh#87).
Definition llama_cpp_backend.cpp:1026

entropic::spec_emit_token
static std::string spec_emit_token(SpeculativeRunState &state, llama_token id, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
Emit on_token for one accepted id, updating state and returning a stop signal when terminating condit...
Definition llama_cpp_backend.cpp:3040

entropic::warm_keep_cut
std::size_t warm_keep_cut(const std::vector< Tok > &resident, const std::vector< Tok > &incoming, long kv_pos_max)
Decide how many resident-KV tokens warm-keep may reuse this turn.
Definition warm_keep_util.h:62

entropic::spec_rollback_partial
static void spec_rollback_partial(SpeculativeRunState &state, common_sampler *smpl_save, std::vector< llama_token > &ids)
Partial-acceptance rollback: restore both contexts and the sampler to their pre-draft state,...
Definition llama_cpp_backend.cpp:3142

entropic::fill_batch_cell
static void fill_batch_cell(llama_batch &b, int k, llama_token tok, llama_pos pos, llama_seq_id seq, bool want_logits)
Fill one cell of a multi-seq llama_batch.
Definition llama_cpp_backend.cpp:1664

entropic::spec_check_preconditions
static std::string spec_check_preconditions(bool target_active, bool draft_active, llama_context *ctx_tgt, llama_context *ctx_dft)
Validate speculative preconditions and reject contexts that do not support seq_rm.
Definition llama_cpp_backend.cpp:3310

entropic::ModelState::WARM
@ WARM
mmap'd + mlock'd in RAM

entropic::ModelState::COLD
@ COLD
On disk only, no RAM consumed.

entropic::strip_thinking_channels
void strip_thinking_channels(std::string &content, std::string *reasoning_out)
Strip Gemma 4 QAT reasoning channels (<|channel>…<channel|>) from content, accumulating the stripped ...
Definition llama_cpp_backend.cpp:1323

entropic::render_common_chat
static std::optional< common_chat_params > render_common_chat(llama_model *model, const std::vector< Message > &messages, const GenerationParams &params, const std::vector< common_chat_tool > &tools)
Shared common_chat render core for both template paths (gh#87).
Definition llama_cpp_backend.cpp:1090

entropic::concat_messages_fallback
static std::string concat_messages_fallback(const std::vector< Message > &messages)
Plain "role: content" join used when templating fails.
Definition llama_cpp_backend.cpp:1125

entropic::spec_run_draft
static int spec_run_draft(SpeculativeRunState &state)
Trigger draft generation via common_speculative_draft.
Definition llama_cpp_backend.cpp:3014

entropic::spec_run_loop
static void spec_run_loop(SpeculativeRunState &state, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
Run the accept-round loop until completion / EOS / cancel.
Definition llama_cpp_backend.cpp:3436

entropic::mtp_unsupported_reason
std::string mtp_unsupported_reason(float temperature, bool has_grammar, bool streaming)
Reason MTP cannot run for a request, or "" when the envelope is safe.
Definition mtp_envelope.h:75

entropic::to_common_chat
static std::vector< common_chat_msg > to_common_chat(const std::vector< Message > &messages)
Convert engine messages to common_chat_msg (gh#86, v2.6.1).
Definition llama_cpp_backend.cpp:999

entropic::to_entropic_tool_call
static ToolCall to_entropic_tool_call(const common_chat_tool_call &cc)
Map a common_chat_tool_call to entropic's ToolCall (gh#87).
Definition llama_cpp_backend.cpp:1056

entropic::to_llama_chat
static std::vector< llama_chat_message > to_llama_chat(const std::vector< Message > &messages)
Convert engine messages to llama_chat_message views.
Definition llama_cpp_backend.cpp:976

entropic::spec_finalize
static GenerationResult spec_finalize(SpeculativeRunState &state, std::chrono::steady_clock::time_point t0)
Build the final GenerationResult for a speculative run from the run state and the start time t0.
Definition llama_cpp_backend.cpp:3469

entropic::spec_accept_round
static bool spec_accept_round(SpeculativeRunState &state, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
Run one speculative accept round; return false to stop.
Definition llama_cpp_backend.cpp:3251

entropic::spec_run_from_tokens
static GenerationResult spec_run_from_tokens(llama_context *ctx_tgt, llama_context *ctx_dft, llama_model *model_tgt, const std::vector< llama_token > &tokens, const GenerationParams &params, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel, int n_draft_max, const std::string &draft_path, std::chrono::steady_clock::time_point t0)
Public entry point for the speculative-decoding kernel.
Definition llama_cpp_backend.cpp:3544

entropic::batch_error_result
static GenerationResult batch_error_result(const std::string &msg)
Build a single error GenerationResult (gh#98 batch failures).
Definition llama_cpp_backend.cpp:1651

entropic::spec_build_batch
static void spec_build_batch(SpeculativeRunState &state)
Build the target batch [id_last, draft0, ..., draftN-1].
Definition llama_cpp_backend.cpp:2964

entropic::spec_prepare_draft
static int spec_prepare_draft(SpeculativeRunState &state)
Drive one accept round: optional draft generation, decode on both contexts, sample-and-accept,...
Definition llama_cpp_backend.cpp:3233

entropic::spec_ckpt_save_dft
static void spec_ckpt_save_dft(SpeculativeRunState &state)
Snapshot the draft's pre-draft state so that spec_ckpt_restore_dft can restore it later.
Definition llama_cpp_backend.cpp:3089

entropic::spec_cleanup
static void spec_cleanup(SpeculativeRunState &state)
Free everything allocated by the kernel.
Definition llama_cpp_backend.cpp:2950

entropic::spec_init_sampler_and_decoder
static std::string spec_init_sampler_and_decoder(SpeculativeRunState &state, llama_model *model_tgt, const GenerationParams &params, int n_draft_max, const std::string &draft_path)
Initialize the kernel state: clear KV, prefill, sampler, speculative context, batch,...
Definition llama_cpp_backend.cpp:3369

entropic::spec_ckpt_restore_dft
static void spec_ckpt_restore_dft(SpeculativeRunState &state)
Restore the draft's pre-draft state so the upcoming target-batch decode on the draft re-fills cleanly...
Definition llama_cpp_backend.cpp:3123

entropic::spec_init_run
static std::string spec_init_run(SpeculativeRunState &state, llama_model *model_tgt, const std::vector< llama_token > &tokens, const GenerationParams &params, int n_draft_max, const std::string &draft_path)
Initialize speculative run state (prefill + sampler + decoder).
Definition llama_cpp_backend.cpp:3411

entropic::BackendInfo
Backend metadata for introspection.
Definition backend_capability.h:58

entropic::BackendInfo::ram_bytes
size_t ram_bytes
RAM consumed by loaded model (bytes). 0 if COLD.
Definition backend_capability.h:77

entropic::BackendInfo::max_context_length
int max_context_length
Maximum context length.
Definition backend_capability.h:74

entropic::BackendInfo::parameter_count
size_t parameter_count
Number of parameters (from model metadata).
Definition backend_capability.h:78

entropic::BackendInfo::architecture
std::string architecture
Architecture family of the loaded model.
Definition backend_capability.h:68

entropic::BackendInfo::compute_device
std::string compute_device
"cuda", "vulkan", "cpu", "npu"
Definition backend_capability.h:61

entropic::BackendInfo::name
std::string name
Backend identifier (e.g. "llama.cpp", "axcl")
Definition backend_capability.h:59

entropic::BackendInfo::quantization
std::string quantization
Quantization type (e.g. "IQ3_XXS", "Q8_0", "fp16").
Definition backend_capability.h:79

entropic::BackendInfo::vram_bytes
size_t vram_bytes
VRAM consumed by loaded model (bytes). 0 if COLD.
Definition backend_capability.h:76

entropic::BackendInfo::model_format
std::string model_format
"gguf", "axmodel", "onnx", etc.
Definition backend_capability.h:62

entropic::CacheEntry
Single cached KV state snapshot.
Definition prompt_cache.h:62

entropic::CacheEntry::data
std::vector< uint8_t > data
Raw KV cache bytes.
Definition prompt_cache.h:63

entropic::CacheEntry::data_size
size_t data_size
data.size() for quick byte accounting
Definition prompt_cache.h:65

entropic::CacheEntry::token_count
int token_count
Prompt tokens covered by this entry.
Definition prompt_cache.h:64

entropic::CacheKey
64-bit hash used as cache lookup key.
Definition prompt_cache.h:38

entropic::GenerationParams
Generation parameters for a single inference call.
Definition config.h:302

entropic::GenerationParams::grammar
std::string grammar
GBNF grammar string (empty = unconstrained)
Definition config.h:359

entropic::GenerationParams::top_k
int top_k
Top-K sampling.
Definition config.h:305

entropic::GenerationParams::repeat_penalty
float repeat_penalty
Repetition penalty.
Definition config.h:306

entropic::GenerationParams::temperature
float temperature
Sampling temperature.
Definition config.h:303

entropic::GenerationParams::frequency_penalty
float frequency_penalty
Frequency-penalty term in llama.cpp's penalties sampler (gh#23 MVP item 3).
Definition config.h:349

entropic::GenerationParams::presence_penalty
float presence_penalty
Presence-penalty term in llama.cpp's penalties sampler (gh#23 MVP item 2).
Definition config.h:322

entropic::GenerationParams::enable_thinking
bool enable_thinking
Enable <think> blocks (false if reasoning_budget == 0)
Definition config.h:358

entropic::GenerationParams::min_p
float min_p
Min-p sampling threshold (gh#23 MVP item 1).
Definition config.h:315

entropic::GenerationParams::max_tokens
int max_tokens
Maximum tokens to generate.
Definition config.h:351

entropic::GenerationParams::top_p
float top_p
Nucleus sampling threshold.
Definition config.h:304

entropic::GenerationParams::seed
int seed
RNG seed for reproducible sampling.
Definition config.h:356

entropic::GenerationParams::stop
std::vector< std::string > stop
Stop sequences.
Definition config.h:365

entropic::GenerationResult
Result of a single generation call.
Definition generation_result.h:30

entropic::GenerationResult::error_code
entropic_error_t error_code
Error code (ENTROPIC_OK if no error)
Definition generation_result.h:82

entropic::GenerationResult::generation_time_ms
double generation_time_ms
Wall-clock generation time.
Definition generation_result.h:36

entropic::GenerationResult::n_drafted
int n_drafted
Tokens proposed by the draft/MTP head across all rounds.
Definition generation_result.h:74

entropic::GenerationResult::seq_id
int seq_id
Sequence identifier for multi-sequence backends.
Definition generation_result.h:66

entropic::GenerationResult::throughput_tok_s
double throughput_tok_s
Measured throughput for this generation (tok/s).
Definition generation_result.h:48

entropic::GenerationResult::finish_reason
std::string finish_reason
Finish reason: "stop", "length", "error".
Definition generation_result.h:34

entropic::GenerationResult::content
std::string content
Generated text (cleaned by adapter)
Definition generation_result.h:31

entropic::GenerationResult::n_accepted
int n_accepted
Draft tokens the target accepted (≤ n_drafted).
Definition generation_result.h:79

entropic::GenerationResult::error_message
std::string error_message
Error description (empty if no error)
Definition generation_result.h:83

entropic::GenerationResult::token_count
int token_count
Generated token count.
Definition generation_result.h:35

entropic::LlamaCppBackend::CommonChatResult
Result of a common_chat parse: native tool calls + split content.
Definition llama_cpp_backend.h:304

entropic::LlamaCppBackend::CommonChatResult::tool_calls
std::vector< ToolCall > tool_calls
Extracted native tool calls.
Definition llama_cpp_backend.h:305

entropic::LlamaCppBackend::CommonChatResult::content
std::string content
Content with calls + reasoning removed.
Definition llama_cpp_backend.h:306

entropic::LlamaCppBackend::CommonChatResult::reasoning_content
std::string reasoning_content
Extracted reasoning/thought block.
Definition llama_cpp_backend.h:307

entropic::LogprobResult
Per-token log-probability evaluation result.
Definition logprob_result.h:36

entropic::LogprobResult::logprobs
std::vector< float > logprobs
Log-prob for each token transition (N-1 values)
Definition logprob_result.h:37

entropic::LogprobResult::n_logprobs
int n_logprobs
Number of logprob values (n_tokens - 1)
Definition logprob_result.h:42

entropic::LogprobResult::n_tokens
int n_tokens
Number of input tokens.
Definition logprob_result.h:41

entropic::LogprobResult::total_logprob
float total_logprob
Sum of all logprob values.
Definition logprob_result.h:40

entropic::LogprobResult::perplexity
float perplexity
exp(-mean(logprobs)) — lower = less surprising
Definition logprob_result.h:39

entropic::LogprobResult::tokens
std::vector< int32_t > tokens
Input tokens echoed back for verification.
Definition logprob_result.h:38

entropic::ModelConfig
Model configuration for a single tier.
Definition config.h:148

entropic::ModelConfig::mmproj_path
std::filesystem::path mmproj_path
Vision projector GGUF path.
Definition config.h:244

entropic::ModelConfig::gpu_layers
int gpu_layers
GPU offload layers (-1 = all)
Definition config.h:152

entropic::ModelConfig::n_ubatch
int n_ubatch
Physical micro-batch size for prompt processing (gh#23 MVP item 5).
Definition config.h:172

entropic::ModelConfig::context_length
int context_length
Context window size (512–131072)
Definition config.h:151

entropic::ModelConfig::path
std::filesystem::path path
Resolved model file path.
Definition config.h:149

entropic::ModelConfig::rope_freq_scale
float rope_freq_scale
RoPE frequency scaling factor (gh#23 MVP item 10).
Definition config.h:222

entropic::ModelConfig::main_gpu
int main_gpu
Primary GPU index for model load (gh#23 MVP item 7).
Definition config.h:194

entropic::ModelConfig::n_threads
int n_threads
CPU threads (0 = auto-detect)
Definition config.h:174

entropic::ModelConfig::offload_kqv
bool offload_kqv
Offload KQV ops (including the KV cache) to the GPU.
Definition config.h:202

entropic::ModelConfig::n_parallel
int n_parallel
Max parallel sequences per context (gh#23 MVP item 11).
Definition config.h:232

entropic::ModelConfig::cache_type_k
std::string cache_type_k
KV cache key quantization type.
Definition config.h:158

entropic::ModelConfig::cache_type_v
std::string cache_type_v
KV cache value quantization type.
Definition config.h:159

entropic::ModelConfig::split_mode
std::string split_mode
Multi-GPU split mode for model load (gh#23 MVP item 6).
Definition config.h:186

entropic::ModelConfig::n_batch
int n_batch
Batch size for prompt processing.
Definition config.h:160

entropic::ModelConfig::flash_attn
bool flash_attn
Enable flash attention.
Definition config.h:233

entropic::ModelConfig::use_mlock
bool use_mlock
Lock model in system RAM.
Definition config.h:154

entropic::ModelConfig::rope_freq_base
float rope_freq_base
RoPE base frequency override (gh#23 MVP item 9).
Definition config.h:212

entropic::PromptCacheConfig::max_bytes
size_t max_bytes
Maximum cache RAM (512 MB default)
Definition config.h:267

entropic::PromptCacheConfig::log_hits
bool log_hits
Log cache hit/miss at INFO level.
Definition config.h:269

entropic::PromptCacheConfig::enabled
bool enabled
Master switch (false = no caching)
Definition config.h:268

entropic::PromptCacheConfig::warm_keep
bool warm_keep
gh#96 (v2.7.5): keep the prior turn's KV resident and re-decode only the appended delta (warm-keep / ...
Definition config.h:274

entropic::SpeculativeRunState
Bundles per-kernel-run mutable state to keep the loop body focused on its responsibility (knots: cogn...
Definition llama_cpp_backend.cpp:2910

entropic::SpeculativeRunState::stop
std::vector< std::string > stop
gh#108: stop seqs (effective_stop); empty for gh#36
Definition llama_cpp_backend.cpp:2923

entropic::ToolCall
A tool call request parsed from model output.
Definition tool_call.h:31

entropic::ToolCall::arguments
std::unordered_map< std::string, std::string > arguments
Tool arguments as string key-value pairs.
Definition tool_call.h:34

entropic::ToolCall::id
std::string id
Unique call ID (UUID)
Definition tool_call.h:32

entropic::ToolCall::arguments_json
std::string arguments_json
Original JSON string (for passthrough dispatch)
Definition tool_call.h:35

entropic::ToolCall::name
std::string name
Tool name (e.g. "filesystem.read_file")
Definition tool_call.h:33

tool_call_markers.h
gh#103 (v2.8.2): family-aware tool-call CLOSE markers, derived from the resolved common_chat format.

warm_keep_util.h
gh#96 (v2.7.5) warm-keep / incremental-prefill decision logic.