entropic/llama__cpp__backend_8h_source.html

// SPDX-License-Identifier: Apache-2.0

#pragma once


#include <entropic/inference/backend.h>

#include <entropic/inference/sampler.h>

#include <entropic/inference/tokenizer.h>


#include "prompt_cache.h"


#include <llama.h>


#include <atomic>

#include <chrono>

#include <cstdint>

#include <functional>

#include <memory>

#include <mutex>

#include <string>

#include <vector>


// Forward-declare libmtmd's opaque types at file scope so they

// resolve to ::mtmd_context / ::mtmd_bitmap (not entropic::mtmd_*)

// when referenced inside the class body. Full types live in

// extern/llama.cpp/tools/mtmd/mtmd.h and are only included from

// the implementation file. (v2.1.8, gh#37/v1.9.11 Phase 5)

extern "C" {

// Opaque libmtmd context type; this header only ever holds a pointer to it.
struct mtmd_context;

// Opaque libmtmd bitmap type; this header only ever passes pointers to it.
struct mtmd_bitmap;

}


namespace entropic {


/// llama.cpp-backed implementation of InferenceBackend.
///
/// Overrides the base class's do_* hooks (lifecycle, generation, evaluation,
/// capabilities, state save/restore) and keeps the raw llama.cpp handles
/// (model, context, vocab) plus the prompt cache, tokenizer/sampler seams,
/// common_chat render/parse state, MTP draft head and libmtmd vision context.
/// Only a handful of trivial accessors are defined inline here; everything
/// else is defined in the implementation file.
class LlamaCppBackend : public InferenceBackend {

public:

    /// Destructor (defined out of line in the implementation file).
    ~LlamaCppBackend() override;


    /// Inject a tokenizer for unit testing (v2.3.10).
    /// @param tokenizer Tokenizer handed over by unique_ptr (ownership moves
    ///        into the call).
    void inject_tokenizer_for_test(std::unique_ptr<Tokenizer> tokenizer);


    /// Inject a sampler factory (test-only seam, going by the name).
    /// @param factory Factory handed over by unique_ptr; presumably stored in
    ///        sampler_factory_ — confirm in the implementation file.
    void inject_sampler_factory_for_test(

        std::unique_ptr<SamplerFactory> factory);


    /// Read the currently-wired SamplerFactory (test-only).
    /// @return Non-owning pointer from sampler_factory_.get(); nullptr when no
    ///         factory is set.
    SamplerFactory* sampler_factory_for_test() const {

        return sampler_factory_.get();

    }


    /// Allocate a temp seq_id (test-only seam): forwards to the protected
    /// allocate_temp_seq_id().
    llama_seq_id allocate_temp_seq_id_for_test() {

        return allocate_temp_seq_id();

    }


    /// Release a temp seq_id (test-only seam, gh#98): forwards to the
    /// protected release_temp_seq_id().
    /// @param id Sequence ID to hand back.
    void release_temp_seq_id_for_test(llama_seq_id id) {

        release_temp_seq_id(id);

    }


    /// Set prompt cache configuration.
    /// Only copies @p config into prompt_cache_config_; the existing
    /// prompt_cache_ object is not touched by this call.
    void set_prompt_cache_config(const PromptCacheConfig& config) {

        prompt_cache_config_ = config;

    }


    /// Drop every cached prefix so the next prefill re-seeds.
    /// No-op when no prompt cache object exists (prompt_cache_ is null).
    void clear_prompt_cache() override {

        if (prompt_cache_) { prompt_cache_->clear(); }

    }


    /// Tokenize text to token IDs using model vocabulary.
    /// @param text Input text.
    /// @return Token IDs as int32_t values.
    std::vector<int32_t> tokenize_text(

        const std::string& text) const override;


    /* ── llama.cpp handle accessors (v1.9.2) ────────────── */


    /// Get the loaded llama_model pointer (non-owning view of model_).
    llama_model* llama_model_ptr() { return model_; }


    /// Get the active llama_context pointer (non-owning view of ctx_).
    llama_context* llama_context_ptr() { return ctx_; }


    /// Prompt tokens decoded by the last generate() (gh#96 counter).
    int last_prefill_tokens() const { return last_prefill_tokens_; }


    /// Batched-decode step count of the last batch (gh#98 counter).
    int last_gen_decode_calls() const { return last_gen_decode_calls_; }


    /// Wall-clock milliseconds spent in prefill by the last generation.
    double last_prefill_ms() const { return last_prefill_ms_; }


    /// Tokenized prompt size of the last generation (input tokens).
    int last_input_tokens() const { return last_input_tokens_; }


    /// Highest occupied KV position in seq 0 right now (live query).
    /// @return The llama_memory_seq_pos_max() result for sequence 0 cast to
    ///         int, or -1 when there is no context (ctx_ is null).
    int kv_pos_max() const {

        return ctx_ != nullptr

            ? static_cast<int>(llama_memory_seq_pos_max(llama_get_memory(ctx_), 0))

            : -1;

    }


    /* ── gh#87 (v2.7.0): common_chat tool-call render + parse ── */


    /// Result of parse_response(): extracted tool calls plus two text fields.
    struct CommonChatResult {

        std::vector<ToolCall> tool_calls;   // tool calls extracted from the emission

        std::string content;                // content text

        std::string reasoning_content;      // reasoning text, kept apart from content

    };


    /// Stage tool definitions for the next common_chat render (gh#87).
    /// @param tools_json Tool definitions as a JSON string.
    void set_active_tools(const std::string& tools_json);


    /// Render messages through common_chat WITH the active tools.
    /// Non-const: per the member comments further down, a render overwrites
    /// the captured last_* state and a successful one refreshes the parse_*
    /// snapshot.
    /// @return The rendered prompt text.
    std::string render_with_tools(

        const std::vector<Message>& messages,

        const GenerationParams& params);


    /// Parse a raw model emission via the last captured render params.
    /// @param raw Raw model output text.
    /// @return Tool calls, content and reasoning content.
    CommonChatResult parse_response(const std::string& raw) const;


    /// True iff the last render captured common_chat parse params (gh#87).
    bool has_common_chat_params() const { return have_chat_params_; }


    /// True iff common_chat parsing is reliable for the last render (gh#87).
    bool common_chat_parse_reliable() const;


    /// Tool-call close marker for the captured chat format (gh#103).
    std::string tool_call_close_marker() const override;


    /// params.stop + the sequential tool-call close marker, if applicable.
    /// @return The stop strings to use for this generation.
    std::vector<std::string> effective_stop(

        const GenerationParams& params) const;


protected:

    /* ── Lifecycle overrides ─────────────────────────────── */


    /// Load model into CPU RAM (COLD → WARM).
    bool do_load(const ModelConfig& config) override;

    /// Activation hook. load_gpu_model() and create_inference_context() are
    /// documented as its steps 1 and 2.
    bool do_activate() override;

    /// Deactivate: free context, reload model CPU-only.
    void do_deactivate() override;

    /// Unload hook (counterpart of do_load; details live in the
    /// implementation file).
    void do_unload() override;


    /* ── Generation overrides ────────────────────────────── */


    /// Generate a complete response using chat template.
    GenerationResult do_generate(

        const std::vector<Message>& messages,

        const GenerationParams& params) override;


    /// Overload of do_generate that additionally takes a cancellation flag.
    /// @param cancel Shared atomic flag; presumably polled to abort the
    ///        generation early — confirm in the implementation file.
    GenerationResult do_generate(

        const std::vector<Message>& messages,

        const GenerationParams& params,

        std::atomic<bool>& cancel) override;


    /// Streaming generation with per-token callback.
    /// @param on_token Callback receiving token text as a string_view.
    /// @param cancel   Shared atomic cancellation flag.
    GenerationResult do_generate_streaming(

        const std::vector<Message>& messages,

        const GenerationParams& params,

        std::function<void(std::string_view token)> on_token,

        std::atomic<bool>& cancel) override;


    /// Speculative-generation hook; same parameter shape as
    /// do_generate_streaming.
    GenerationResult do_generate_speculative(

        const std::vector<Message>& messages,

        const GenerationParams& params,

        std::function<void(std::string_view token)> on_token,

        std::atomic<bool>& cancel) override;


    /// Batch-generation hook: several message lists in, one result per
    /// request out.
    /// @param requests One message list per request.
    /// @param params   Generation parameters; presumably one entry per
    ///        request — verify against the caller.
    /// @param cancel   Shared atomic cancellation flag.
    std::vector<GenerationResult> do_generate_batch(

        const std::vector<std::vector<Message>>& requests,

        const std::vector<GenerationParams>& params,

        std::atomic<bool>& cancel) override;


public:

    /// Speculative-decoding kernel with explicit draft backend.
    /// @param draft       Backend used as the draft model.
    /// @param n_draft_max Upper bound on the draft length (going by the name).
    /// @param draft_path  Path associated with the draft model.
    GenerationResult generate_speculative_with_draft(

        const std::vector<Message>& messages,

        const GenerationParams& params,

        std::function<void(std::string_view token)> on_token,

        std::atomic<bool>& cancel,

        LlamaCppBackend& draft,

        int n_draft_max,

        const std::string& draft_path);


    /// Generation through the MTP draft head (gh#106 section below).
    /// @param head_path Path the MTP head is built from.
    /// @param n_max     MTP draft window.
    GenerationResult generate_mtp(

        const std::vector<Message>& messages,

        const GenerationParams& params,

        std::function<void(std::string_view token)> on_token,

        std::atomic<bool>& cancel,

        const std::string& head_path,

        int n_max);


    /// True when an MTP head context is live against the current ctx_
    /// (implemented as a non-null check on mtp_draft_ctx_).
    bool mtp_active() const { return mtp_draft_ctx_ != nullptr; }


protected:

    /// Raw text completion without chat template.
    GenerationResult do_complete(

        const std::string& prompt,

        const GenerationParams& params) override;


    /// Count tokens in text.
    int do_count_tokens(const std::string& text) const override;


    /* ── Evaluation override (v1.9.10) ──────────────────── */


    /// Evaluate per-token log-probabilities via sequential decode.
    /// @param tokens   Pointer to the token IDs to evaluate.
    /// @param n_tokens Number of entries readable through @p tokens.
    LogprobResult do_evaluate_logprobs(

        const int32_t* tokens,

        int n_tokens) override;


    /* ── Capability overrides (v1.9.13) ──────────────────── */


    /// Declare llama.cpp backend capabilities.
    bool do_supports(BackendCapability cap) const override;

    /// Return backend name.
    std::string do_backend_name() const override;

    /// Backend info hook (returns a BackendInfo describing this backend).
    BackendInfo do_info() const override;

    /// Clear KV cache or recurrent hidden state.
    bool do_clear_state(int seq_id) override;


    /* ── State save/load override (gh#23 MVP item 13, v2.3.25) ── */


    /// Capture a sequence's KV cache into a byte buffer.
    /// @param seq_id Sequence to capture.
    /// @param buffer Output byte buffer.
    bool do_save_state(int seq_id,

                       std::vector<uint8_t>& buffer) const override;


    /// Restore a sequence's KV cache from a byte buffer.
    /// @param seq_id Sequence to restore into.
    /// @param buffer Bytes to restore from.
    bool do_restore_state(int seq_id,

                          const std::vector<uint8_t>& buffer) override;


    /* ── llama.cpp handles ───────────────────────────────── */


    // Loaded model handle; null until a model is loaded.
    llama_model* model_ = nullptr;

    // Inference context (ACTIVE); null otherwise (kv_pos_max() then yields -1).
    llama_context* ctx_ = nullptr;

    // Vocabulary (from model_).
    const llama_vocab* vocab_ = nullptr;

    // gh#96: prompt tokens decoded by last generate().
    int last_prefill_tokens_ = 0;

    // gh#98: batched-decode step count of last batch.
    int last_gen_decode_calls_ = 0;

    // gh#97: tokenized prompt size of last generate().
    int last_input_tokens_ = 0;

    // gh#96: prefill wall-clock ms of last generate().
    double last_prefill_ms_ = 0.0;

    // Token sequence tracked alongside the context; presumably mirrors what
    // is resident in the KV cache for try_warm_reuse() — TODO confirm.
    std::vector<llama_token> resident_tokens_;


    /* ── gh#106 (v2.9.0): MTP draft head (target-owned, shared-KV) ── */

    // MTP head model handle; null when no head is set up.
    llama_model* mtp_draft_model_ = nullptr;

    // MTP head context; a non-null value is what mtp_active() reports.
    llama_context* mtp_draft_ctx_ = nullptr;

    // Path the live mtp_draft_ctx_ was built from.
    std::string mtp_head_path_;

    // MTP draft window (n_max) of the live head.
    int mtp_n_max_ = 16;

    // Mutex for the MTP state above (exact scope is in the implementation).
    std::mutex mtp_mutex_;


    /* ── v2.3.10 seam: tokenizer abstraction ─────────────── */


    // Tokenizer used by tokenize_text / do_count_tokens / internal
    // tokenize/detokenize.
    std::unique_ptr<Tokenizer> tokenizer_;


    /* ── v2.3.10 seam: sampler abstraction ───────────────── */


    // Factory used by the decode loop to build per-generation samplers.
    std::unique_ptr<SamplerFactory> sampler_factory_;


    /* ── Prompt cache ───────────────────────────────────── */


    // Configuration last supplied through set_prompt_cache_config().
    PromptCacheConfig prompt_cache_config_;

    // KV prefix cache (v1.8.3); may be null (clear_prompt_cache() checks).
    std::unique_ptr<PromptCache> prompt_cache_;


    /* ── gh#87 (v2.7.0): common_chat tool-call render/parse state ─ */


    // MCP tool defs for next render.
    std::string active_tools_json_;

    // LIVE capture — overwritten by EVERY render (incl. a toolless interleave

    // like the constitutional validator's critique). Serves has_common_chat_

    // params() / tool_call_close_marker() — "what THIS render produced".

    // Captured common_chat_format.
    int last_chat_format_ = 0;

    // Captured generation_prompt.
    std::string last_generation_prompt_;

    // Captured serialized PEG arena.
    std::string last_parser_;

    // True once a tool render captured params.
    bool have_chat_params_ = false;

    // gh#105 (v2.8.3): "sticky last-tooled" parse snapshot — written ONLY by a

    // successful render_with_tools, NEVER cleared by a toolless render. The

    // engine RE-parses the main output (engine.cpp:543) AFTER the validator's

    // toolless critique render; parse_response/common_chat_parse_reliable read

    // THIS so that interleave can't clobber the main call's parser → no more

    // zero-tool-call extraction with constitutional validation on.

    // Sticky counterpart of last_chat_format_.
    int parse_chat_format_ = 0;

    // Last TOOLED render's gen prompt.
    std::string parse_generation_prompt_;

    // Sticky counterpart of last_parser_.
    std::string parse_parser_;

    // True once a tooled render snapshotted.
    bool parse_params_valid_ = false;


    /* ── Internal helpers ────────────────────────────────── */


    /// Tokenize text using model vocabulary.
    /// @param add_special Forwarded tokenizer flag; presumably controls
    ///        special-token insertion — confirm in the implementation file.
    std::vector<llama_token> tokenize(

        const std::string& text, bool add_special) const;


    /// Detokenize a single token.
    std::string detokenize(llama_token token) const;


    /// Apply chat template to messages.
    std::string apply_chat_template(

        const std::vector<Message>& messages,

        const GenerationParams& params) const;


    /// Generation render seam: common_chat-with-tools or legacy (gh#87).
    std::string render_prompt(

        const std::vector<Message>& messages,

        const GenerationParams& params);


    /// Low-level GGUF template path (gh#86 fallback, v2.6.1).
    std::string apply_chat_template_lowlevel(

        const std::vector<Message>& messages) const;


    /// Core decode loop — shared by generate and streaming.
    /// @param tokens   Prompt tokens.
    /// @param on_token Per-token callback.
    /// @param cancel   Cancellation flag passed as a pointer (unlike the
    ///        public overloads, which take a reference).
    GenerationResult decode_loop(

        const std::vector<llama_token>& tokens,

        const GenerationParams& params,

        std::function<void(std::string_view)> on_token,

        std::atomic<bool>* cancel);


    /// The post-prefill sampling loop (extracted from decode_loop).
    GenerationResult generate_after_prefill(

        Sampler& sampler,

        const GenerationParams& params,

        std::function<void(std::string_view)> on_token,

        std::atomic<bool>* cancel);


    /// Per-sequence bookkeeping for the batched decode helpers below.
    struct BatchSeq {

        std::unique_ptr<Sampler> sampler;   // sampler owned by this sequence

        llama_sampler* chain = nullptr;     // raw llama_sampler pointer; ownership not visible here

        llama_seq_id seq_id = 0;            // llama sequence ID of this entry

        int pos = 0;                        // position counter for this sequence

        int logits_idx = -1;                // set per-seq by prefill_batch_suffixes; -1 = unset

        int n_gen = 0;                      // presumably tokens generated so far — TODO confirm

        int max_tokens = 0;                 // presumably the per-sequence generation cap — TODO confirm

        bool active = true;                 // "still-active" flag used by sample_batch_active

        std::vector<llama_token> out;       // tokens later detokenized by build_batch_results

        std::string finish = "stop";        // finish label; defaults to "stop"

    };


    /// Run the gh#98 multi-seq batched decode (v2.8.0).
    /// @param toks   One token vector per request.
    /// @param params Generation parameters for the requests.
    /// @param shared Length of the prefix the requests have in common
    ///        (inferred from prefill_shared_and_fanout — TODO confirm).
    /// @param cancel Shared atomic cancellation flag.
    std::vector<GenerationResult> run_batched_decode(

        const std::vector<std::vector<llama_token>>& toks,

        const std::vector<GenerationParams>& params,

        std::size_t shared,

        std::atomic<bool>& cancel);


    /// Prepare the per-sequence BatchSeq entries from the generation params.
    /// @return Success flag (exact failure conditions are in the
    ///         implementation file).
    bool prepare_batch_seqs(std::vector<BatchSeq>& seqs,

                            const std::vector<GenerationParams>& params);

    /// Prefill shared prefix into seq 0 + seq_cp fan-out.
    bool prefill_shared_and_fanout(std::vector<BatchSeq>& seqs,

                                   const std::vector<llama_token>& seq0,

                                   std::size_t shared);

    /// Prefill each request's suffix; set per-seq logits_idx.
    bool prefill_batch_suffixes(

        std::vector<BatchSeq>& seqs,

        const std::vector<std::vector<llama_token>>& toks,

        std::size_t shared);

    /// Decode all sequences together until each finishes.
    void run_batch_gen_loop(std::vector<BatchSeq>& seqs, int max_steps,

                            std::atomic<bool>& cancel);

    /// Sample+accept+classify each still-active sequence.
    void sample_batch_active(std::vector<BatchSeq>& seqs);

    /// Detokenize each sequence into a GenerationResult.
    std::vector<GenerationResult> build_batch_results(

        std::vector<BatchSeq>& seqs);

    /// Release every batch sequence's temp seq_id (seq 0 excluded).
    void release_temp_seqs(std::vector<BatchSeq>& seqs);


    /// Run batched prefill on input tokens.
    bool run_prefill(const std::vector<llama_token>& tokens);


    /// Generate one token and append to output.
    /// @param generated Accumulated output text, appended to in place.
    /// @param on_token  Per-token callback (taken by reference here).
    /// @param stop      Stop strings.
    /// @return A string; its exact meaning is defined in the implementation
    ///         file.
    std::string step_token(

        Sampler& sampler,

        std::string& generated,

        std::function<void(std::string_view)>& on_token,

        const std::vector<std::string>& stop);


    /// Build a Sampler for one generation from params.
    std::unique_ptr<Sampler> create_sampler(

        const GenerationParams& params) const;


    /// Extract the system prompt text from a message list (static helper; no
    /// instance state involved).
    static std::string extract_system_prompt(

        const std::vector<Message>& messages);


    /// Run prefill with prompt cache integration.
    bool run_prefill_cached(

        const std::vector<llama_token>& tokens,

        const std::string& system_prompt,

        const std::vector<Message>& messages,

        const GenerationParams& params);


    /// Cache-aware prefill dispatch (gh#96 v2.7.5: extracted body of
    /// run_prefill_cached).
    bool prefill_dispatch(

        const std::vector<llama_token>& tokens,

        const std::string& system_prompt,

        const std::vector<Message>& messages,

        const GenerationParams& params);


    /// gh#96 (v2.7.5): try incremental prefill against resident KV.
    bool try_warm_reuse(const std::vector<llama_token>& tokens);


    /// Invalidate the resident-KV bookkeeping (presumably resets
    /// resident_tokens_ — confirm in the implementation file).
    void invalidate_resident_kv();


    /// Decode tokens starting at a given offset.
    /// @param start_offset Index into @p tokens at which decoding starts.
    bool decode_tokens_from(

        const std::vector<llama_token>& tokens, int start_offset);


    /// Restore KV state from cache and decode remaining tokens.
    bool restore_cached_prefix(

        const CacheEntry* cached,

        const std::vector<llama_token>& tokens);


    /// Prefill and store the prefix under @p key (cache-miss counterpart of
    /// restore_cached_prefix, going by the name — TODO confirm).
    /// @param prefix_tokens Number of leading tokens that form the prefix.
    bool prefill_and_cache_prefix(

        const std::vector<llama_token>& tokens,

        int prefix_tokens,

        const CacheKey& key);


    /// Capture seq 0 KV state and store under the given key.
    void save_prefix_to_cache(const CacheKey& key, int prefix_tokens);


    /// Compute token count of system messages only.
    int compute_prefix_token_count(

        const std::vector<Message>& messages,

        const GenerationParams& params);


    /* ── Evaluation helpers (v1.9.10) ───────────────────── */


    /// Allocate a temporary sequence ID (counterpart of
    /// release_temp_seq_id()).
    llama_seq_id allocate_temp_seq_id();


    /// Release a temporary sequence ID back to the pool.
    void release_temp_seq_id(llama_seq_id seq_id);


    /// Extract log-probability for a token from logits.
    /// @param logits     Pointer to the logits array.
    /// @param next_token Token whose log-probability is wanted.
    /// @param n_vocab    Vocabulary size (presumably the length of
    ///        @p logits — TODO confirm).
    static float extract_token_logprob(

        const float* logits,

        int32_t next_token,

        int n_vocab);


    // Guards temp seq_id pool (v1.9.10).
    std::mutex seq_id_mutex_;

    // Pool of released temp seq_ids (presumably reused before new ones are
    // minted — TODO confirm).
    std::vector<llama_seq_id> free_seq_ids_;

    // gh#98: monotonic high-water for NEW temp seq_ids; starts at 1.
    llama_seq_id next_temp_seq_id_ = 1;


    /* ── Architecture detection (v1.9.13) ──────────────── */


    // True if loaded model is recurrent (GDN/Mamba/RWKV).
    bool is_recurrent_ = false;

    // gh#97: attention + recurrent/SSM memory.
    bool is_hybrid_ = false;


    /// Check if loaded model is recurrent.
    bool is_recurrent() const;


    /* ── Vision / multimodal (v1.9.11 Phases 5–7 + v2.1.8) ── */


    // libmtmd context, or nullptr if no mmproj loaded.
    ::mtmd_context* mtmd_ctx_ = nullptr;


    // Cached mtmd_support_vision(mtmd_ctx_) result.
    bool has_vision_ = false;


    /// Multimodal generation core (v1.9.11 Phases 5–7).
    GenerationResult generate_multimodal(

        const std::vector<Message>& messages,

        const GenerationParams& params,

        std::function<void(std::string_view token)> on_token,

        std::atomic<bool>* cancel);


    /// Initialize the libmtmd context if mmproj is configured.
    void init_mmproj_if_configured();


    /// Load the GGUF model onto the GPU (do_activate step 1).
    bool load_gpu_model();


    /// Create the llama context + prompt cache (do_activate step 2).
    bool create_inference_context();


    /// Set up the MTP draft head (counterpart of teardown_mtp_draft()).
    /// @param head_path Path of the MTP head.
    /// @param n_max     MTP draft window.
    bool setup_mtp_draft(const std::string& head_path, int n_max);


    /// Build the MTP head from @p head_path (helper for the MTP setup path).
    bool build_mtp_head(const std::string& head_path);


    /// Free the MTP head context + model (gh#106 lifecycle).
    void teardown_mtp_draft();


    /// Reload the model CPU-only for the WARM state (do_deactivate tail).
    void reload_model_cpu_only();


    /// Validate MTP run preconditions (gh#108, fail-fast/fail-loud).
    /// @return A GenerationResult; how success vs. failure is encoded is
    ///         defined in the implementation file.
    GenerationResult mtp_guard(

        const GenerationParams& params,

        const std::function<void(std::string_view)>& on_token,

        const std::string& head_path, int n_max);


    /// Run mtmd_tokenize + mtmd_helper_eval_chunks on a prompt.
    /// @param prompt  Prompt text.
    /// @param bitmaps libmtmd bitmap pointers passed alongside the prompt.
    /// @param err_msg Out-parameter for an error description.
    /// @return entropic_error_t status code.
    entropic_error_t mtmd_prefill(

        const std::string& prompt,

        const std::vector<::mtmd_bitmap*>& bitmaps,

        std::string& err_msg);


    /// Sample tokens until stop / max_tokens / cancel.
    /// @param t0 steady_clock time point supplied by the caller (presumably
    ///        the generation start, used for timing — TODO confirm).
    GenerationResult run_sampling_loop(

        const GenerationParams& params,

        std::function<void(std::string_view token)> on_token,

        std::atomic<bool>* cancel,

        const std::chrono::steady_clock::time_point& t0);


    /// Text-only batch generation (extracted from do_generate).
    GenerationResult do_generate_text_only(

        const std::vector<Message>& messages,

        const GenerationParams& params);


    /// Overload of do_generate_text_only that additionally takes a
    /// cancellation flag.
    GenerationResult do_generate_text_only(

        const std::vector<Message>& messages,

        const GenerationParams& params,

        std::atomic<bool>& cancel);


    /// Text-only streaming generation (streaming sibling of
    /// do_generate_text_only, going by the name and signature).
    GenerationResult do_generate_streaming_text_only(

        const std::vector<Message>& messages,

        const GenerationParams& params,

        std::function<void(std::string_view token)> on_token,

        std::atomic<bool>& cancel);

};


} // namespace entropic

entropic::InferenceBackend
Concrete base class for inference backends (80% logic).
Definition backend.h:69

entropic::InferenceBackend::config
const ModelConfig & config() const
Stored model config.
Definition backend.h:320

entropic::LlamaCppBackend
LlamaCppBackend — common llama.cpp patterns (15% layer).
Definition llama_cpp_backend.h:65

entropic::LlamaCppBackend::parse_params_valid_
bool parse_params_valid_
True once a tooled render snapshotted.
Definition llama_cpp_backend.h:695

entropic::LlamaCppBackend::last_gen_decode_calls_
int last_gen_decode_calls_
gh#98: batched-decode step count of last batch
Definition llama_cpp_backend.h:637

entropic::LlamaCppBackend::load_gpu_model
bool load_gpu_model()
Load the GGUF model onto the GPU (do_activate step 1).
Definition llama_cpp_backend.cpp:408

entropic::LlamaCppBackend::do_load
bool do_load(const ModelConfig &config) override
Load model into CPU RAM (COLD → WARM).
Definition llama_cpp_backend.cpp:273

entropic::LlamaCppBackend::do_supports
bool do_supports(BackendCapability cap) const override
Declare llama.cpp backend capabilities.
Definition llama_cpp_backend.cpp:3978

entropic::LlamaCppBackend::build_batch_results
std::vector< GenerationResult > build_batch_results(std::vector< BatchSeq > &seqs)
Detokenize each sequence into a GenerationResult.
Definition llama_cpp_backend.cpp:1813

entropic::LlamaCppBackend::effective_stop
std::vector< std::string > effective_stop(const GenerationParams &params) const
params.stop + the sequential tool-call close marker, if applicable.
Definition llama_cpp_backend.cpp:1286

entropic::LlamaCppBackend::last_prefill_ms_
double last_prefill_ms_
gh#96: prefill wall-clock ms of last generate()
Definition llama_cpp_backend.h:639

entropic::LlamaCppBackend::last_input_tokens_
int last_input_tokens_
gh#97: tokenized prompt size of last generate()
Definition llama_cpp_backend.h:638

entropic::LlamaCppBackend::decode_loop
GenerationResult decode_loop(const std::vector< llama_token > &tokens, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > *cancel)
Core decode loop — shared by generate and streaming.
Definition llama_cpp_backend.cpp:1562

entropic::LlamaCppBackend::clear_prompt_cache
void clear_prompt_cache() override
Drop every cached prefix so the next prefill re-seeds.
Definition llama_cpp_backend.h:189

entropic::LlamaCppBackend::is_recurrent_
bool is_recurrent_
True if loaded model is recurrent (GDN/Mamba/RWKV).
Definition llama_cpp_backend.h:1068

entropic::LlamaCppBackend::try_warm_reuse
bool try_warm_reuse(const std::vector< llama_token > &tokens)
gh#96 (v2.7.5): try incremental prefill against resident KV.
Definition llama_cpp_backend.cpp:2200

entropic::LlamaCppBackend::do_evaluate_logprobs
LogprobResult do_evaluate_logprobs(const int32_t *tokens, int n_tokens) override
Evaluate per-token log-probabilities via sequential decode.
Definition llama_cpp_backend.cpp:859

entropic::LlamaCppBackend::do_backend_name
std::string do_backend_name() const override
Return backend name.
Definition llama_cpp_backend.cpp:4016

entropic::LlamaCppBackend::is_hybrid_
bool is_hybrid_
gh#97: attention + recurrent/SSM memory
Definition llama_cpp_backend.h:1069

entropic::LlamaCppBackend::do_save_state
bool do_save_state(int seq_id, std::vector< uint8_t > &buffer) const override
Capture a sequence's KV cache into a byte buffer.
Definition llama_cpp_backend.cpp:4089

entropic::LlamaCppBackend::render_prompt
std::string render_prompt(const std::vector< Message > &messages, const GenerationParams &params)
Generation render seam: common_chat-with-tools or legacy (gh#87).
Definition llama_cpp_backend.cpp:1175

entropic::LlamaCppBackend::prompt_cache_
std::unique_ptr< PromptCache > prompt_cache_
KV prefix cache (v1.8.3)
Definition llama_cpp_backend.h:674

entropic::LlamaCppBackend::teardown_mtp_draft
void teardown_mtp_draft()
Free the MTP head context + model (gh#106 lifecycle).
Definition llama_cpp_backend.cpp:529

entropic::LlamaCppBackend::llama_context_ptr
llama_context * llama_context_ptr()
Get the active llama_context pointer.
Definition llama_cpp_backend.h:218

entropic::LlamaCppBackend::parse_generation_prompt_
std::string parse_generation_prompt_
Last TOOLED render's gen prompt.
Definition llama_cpp_backend.h:693

entropic::LlamaCppBackend::run_batched_decode
std::vector< GenerationResult > run_batched_decode(const std::vector< std::vector< llama_token > > &toks, const std::vector< GenerationParams > &params, std::size_t shared, std::atomic< bool > &cancel)
Run the gh#98 multi-seq batched decode (v2.8.0).
Definition llama_cpp_backend.cpp:1849

entropic::LlamaCppBackend::last_prefill_ms
double last_prefill_ms() const
Wall-clock milliseconds spent in prefill by the last generation.
Definition llama_cpp_backend.h:263

entropic::LlamaCppBackend::do_generate
GenerationResult do_generate(const std::vector< Message > &messages, const GenerationParams &params) override
Generate a complete response using chat template.
Definition llama_cpp_backend.cpp:2535

entropic::LlamaCppBackend::reload_model_cpu_only
void reload_model_cpu_only()
Reload the model CPU-only for the WARM state (do_deactivate tail).
Definition llama_cpp_backend.cpp:663

entropic::LlamaCppBackend::mtp_active
bool mtp_active() const
True when an MTP head context is live against the current ctx_.
Definition llama_cpp_backend.h:575

entropic::LlamaCppBackend::render_with_tools
std::string render_with_tools(const std::vector< Message > &messages, const GenerationParams &params)
Render messages through common_chat WITH the active tools.
Definition llama_cpp_backend.cpp:1210

entropic::LlamaCppBackend::sample_batch_active
void sample_batch_active(std::vector< BatchSeq > &seqs)
Sample+accept+classify each still-active sequence.
Definition llama_cpp_backend.cpp:1757

entropic::LlamaCppBackend::common_chat_parse_reliable
bool common_chat_parse_reliable() const
True iff common_chat parsing is reliable for the last render (gh#87).
Definition llama_cpp_backend.cpp:1251

entropic::LlamaCppBackend::set_prompt_cache_config
void set_prompt_cache_config(const PromptCacheConfig &config)
Set prompt cache configuration.
Definition llama_cpp_backend.h:175

entropic::LlamaCppBackend::active_tools_json_
std::string active_tools_json_
MCP tool defs for next render.
Definition llama_cpp_backend.h:678

entropic::LlamaCppBackend::do_complete
GenerationResult do_complete(const std::string &prompt, const GenerationParams &params) override
Raw text completion without chat template.
Definition llama_cpp_backend.cpp:3941

entropic::LlamaCppBackend::last_prefill_tokens_
int last_prefill_tokens_
gh#96: prompt tokens decoded by last generate()
Definition llama_cpp_backend.h:636

entropic::LlamaCppBackend::tokenize
std::vector< llama_token > tokenize(const std::string &text, bool add_special) const
Tokenize text using model vocabulary.
Definition llama_cpp_backend.cpp:788

entropic::LlamaCppBackend::create_inference_context
bool create_inference_context()
Create the llama context + prompt cache (do_activate step 2).
Definition llama_cpp_backend.cpp:453

entropic::LlamaCppBackend::release_temp_seq_id_for_test
void release_temp_seq_id_for_test(llama_seq_id id)
Release a temp seq_id (test-only seam, gh#98).
Definition llama_cpp_backend.h:161

entropic::LlamaCppBackend::vocab_
const llama_vocab * vocab_
Vocabulary (from model_)
Definition llama_cpp_backend.h:635

entropic::LlamaCppBackend::tool_call_close_marker
std::string tool_call_close_marker() const override
Tool-call close marker for the captured chat format (gh#103).
Definition llama_cpp_backend.cpp:1267

entropic::LlamaCppBackend::have_chat_params_
bool have_chat_params_
True once a tool render captured params.
Definition llama_cpp_backend.h:685

entropic::LlamaCppBackend::compute_prefix_token_count
int compute_prefix_token_count(const std::vector< Message > &messages, const GenerationParams &params)
Compute token count of system messages only.
Definition llama_cpp_backend.cpp:2051

entropic::LlamaCppBackend::sampler_factory_
std::unique_ptr< SamplerFactory > sampler_factory_
Factory used by the decode loop to build per-generation samplers.
Definition llama_cpp_backend.h:669

entropic::LlamaCppBackend::release_temp_seqs
void release_temp_seqs(std::vector< BatchSeq > &seqs)
Release every batch sequence's temp seq_id (seq 0 excluded).
Definition llama_cpp_backend.cpp:1832

entropic::LlamaCppBackend::sampler_factory_for_test
SamplerFactory * sampler_factory_for_test() const
Read the currently-wired SamplerFactory (test-only).
Definition llama_cpp_backend.h:136

entropic::LlamaCppBackend::detokenize
std::string detokenize(llama_token token) const
Detokenize a single token.
Definition llama_cpp_backend.cpp:809

entropic::LlamaCppBackend::set_active_tools
void set_active_tools(const std::string &tools_json)
Stage tool definitions for the next common_chat render (gh#87).
Definition llama_cpp_backend.cpp:1192

entropic::LlamaCppBackend::init_mmproj_if_configured
void init_mmproj_if_configured()
Initialize the libmtmd context if mmproj is configured.
Definition llama_cpp_backend.cpp:490

entropic::LlamaCppBackend::last_chat_format_
int last_chat_format_
Captured common_chat_format.
Definition llama_cpp_backend.h:682

entropic::LlamaCppBackend::generate_speculative_with_draft
GenerationResult generate_speculative_with_draft(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel, LlamaCppBackend &draft, int n_draft_max, const std::string &draft_path)
Speculative-decoding kernel with explicit draft backend.
Definition llama_cpp_backend.cpp:3571

entropic::LlamaCppBackend::ctx_
llama_context * ctx_
Inference context (ACTIVE)
Definition llama_cpp_backend.h:634

entropic::LlamaCppBackend::run_prefill
bool run_prefill(const std::vector< llama_token > &tokens)
Run batched prefill on input tokens.
Definition llama_cpp_backend.cpp:1484

entropic::LlamaCppBackend::run_sampling_loop
GenerationResult run_sampling_loop(const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel, const std::chrono::steady_clock::time_point &t0)
Sample tokens until stop / max_tokens / cancel.
Definition llama_cpp_backend.cpp:2440

entropic::LlamaCppBackend::next_temp_seq_id_
llama_seq_id next_temp_seq_id_
gh#98: monotonic high-water for NEW temp seq_ids (the old 1 + size() handed out duplicates when the p...
Definition llama_cpp_backend.h:1059

entropic::LlamaCppBackend::last_generation_prompt_
std::string last_generation_prompt_
Captured generation_prompt.
Definition llama_cpp_backend.h:683

entropic::LlamaCppBackend::mtp_guard
GenerationResult mtp_guard(const GenerationParams &params, const std::function< void(std::string_view)> &on_token, const std::string &head_path, int n_max)
Validate MTP run preconditions (gh#108, fail-fast/fail-loud).
Definition llama_cpp_backend.cpp:3872

entropic::LlamaCppBackend::restore_cached_prefix
bool restore_cached_prefix(const CacheEntry *cached, const std::vector< llama_token > &tokens)
Restore KV state from cache and decode remaining tokens.
Definition llama_cpp_backend.cpp:1997

entropic::LlamaCppBackend::save_prefix_to_cache
void save_prefix_to_cache(const CacheKey &key, int prefix_tokens)
Capture seq 0 KV state and store under the given key.
Definition llama_cpp_backend.cpp:2026

entropic::LlamaCppBackend::tokenize_text
std::vector< int32_t > tokenize_text(const std::string &text) const override
Tokenize text to token IDs using model vocabulary.
Definition llama_cpp_backend.cpp:837

entropic::LlamaCppBackend::mtp_n_max_
int mtp_n_max_
MTP draft window (n_max) of the live head.
Definition llama_cpp_backend.h:646

entropic::LlamaCppBackend::is_recurrent
bool is_recurrent() const
Check if loaded model is recurrent.
Definition llama_cpp_backend.cpp:3965

entropic::LlamaCppBackend::step_token
std::string step_token(Sampler &sampler, std::string &generated, std::function< void(std::string_view)> &on_token, const std::vector< std::string > &stop)
Generate one token and append to output.
Definition llama_cpp_backend.cpp:1521

entropic::LlamaCppBackend::generate_after_prefill
GenerationResult generate_after_prefill(Sampler &sampler, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > *cancel)
The post-prefill sampling loop (extracted from decode_loop).
Definition llama_cpp_backend.cpp:1604

entropic::LlamaCppBackend::kv_pos_max
int kv_pos_max() const
Highest occupied KV position in seq 0 right now (live query).
Definition llama_cpp_backend.h:287

entropic::LlamaCppBackend::mtmd_prefill
entropic_error_t mtmd_prefill(const std::string &prompt, const std::vector<::mtmd_bitmap * > &bitmaps, std::string &err_msg)
Run mtmd_tokenize + mtmd_helper_eval_chunks on a prompt.
Definition llama_cpp_backend.cpp:2397

entropic::LlamaCppBackend::run_batch_gen_loop
void run_batch_gen_loop(std::vector< BatchSeq > &seqs, int max_steps, std::atomic< bool > &cancel)
Decode all sequences together until each finishes.
Definition llama_cpp_backend.cpp:1785

entropic::LlamaCppBackend::run_prefill_cached
bool run_prefill_cached(const std::vector< llama_token > &tokens, const std::string &system_prompt, const std::vector< Message > &messages, const GenerationParams &params)
Run prefill with prompt cache integration.
Definition llama_cpp_backend.cpp:2134

entropic::LlamaCppBackend::mtp_head_path_
std::string mtp_head_path_
Path the live mtp_draft_ctx_ was built from.
Definition llama_cpp_backend.h:645

entropic::LlamaCppBackend::last_input_tokens
int last_input_tokens() const
Tokenized prompt size of the last generation (input tokens).
Definition llama_cpp_backend.h:271

entropic::LlamaCppBackend::do_generate_text_only
GenerationResult do_generate_text_only(const std::vector< Message > &messages, const GenerationParams &params)
Text-only batch generation (extracted from do_generate).
Definition llama_cpp_backend.cpp:2555

entropic::LlamaCppBackend::do_restore_state
bool do_restore_state(int seq_id, const std::vector< uint8_t > &buffer) override
Restore a sequence's KV cache from a byte buffer.
Definition llama_cpp_backend.cpp:4116

entropic::LlamaCppBackend::apply_chat_template
std::string apply_chat_template(const std::vector< Message > &messages, const GenerationParams &params) const
Apply chat template to messages.
Definition llama_cpp_backend.cpp:1154

entropic::LlamaCppBackend::parse_response
CommonChatResult parse_response(const std::string &raw) const
Parse a raw model emission via the last captured render params.
Definition llama_cpp_backend.cpp:1373

entropic::LlamaCppBackend::prefill_batch_suffixes
bool prefill_batch_suffixes(std::vector< BatchSeq > &seqs, const std::vector< std::vector< llama_token > > &toks, std::size_t shared)
Prefill each request's suffix; set per-seq logits_idx.
Definition llama_cpp_backend.cpp:1721

entropic::LlamaCppBackend::tokenizer_
std::unique_ptr< Tokenizer > tokenizer_
Tokenizer used by tokenize_text / do_count_tokens / internal tokenize/detokenize.
Definition llama_cpp_backend.h:658

entropic::LlamaCppBackend::do_generate_streaming
GenerationResult do_generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
Streaming generation with per-token callback.
Definition llama_cpp_backend.cpp:2696

entropic::LlamaCppBackend::has_vision_
bool has_vision_
Cached mtmd_support_vision(mtmd_ctx_) result.
Definition llama_cpp_backend.h:1091

entropic::LlamaCppBackend::inject_tokenizer_for_test
void inject_tokenizer_for_test(std::unique_ptr< Tokenizer > tokenizer)
Inject a tokenizer for unit testing (v2.3.10).
Definition llama_cpp_backend.cpp:712

entropic::LlamaCppBackend::has_common_chat_params
bool has_common_chat_params() const
True iff the last render captured common_chat parse params (gh#87).
Definition llama_cpp_backend.h:376

entropic::LlamaCppBackend::decode_tokens_from
bool decode_tokens_from(const std::vector< llama_token > &tokens, int start_offset)
Decode tokens starting at a given offset.
Definition llama_cpp_backend.cpp:1957

entropic::LlamaCppBackend::prefill_dispatch
bool prefill_dispatch(const std::vector< llama_token > &tokens, const std::string &system_prompt, const std::vector< Message > &messages, const GenerationParams &params)
Cache-aware prefill dispatch (gh#96 v2.7.5: extracted body of run_prefill_cached so the wrapper owns ...
Definition llama_cpp_backend.cpp:2258

entropic::LlamaCppBackend::release_temp_seq_id
void release_temp_seq_id(llama_seq_id seq_id)
Release a temporary sequence ID back to the pool.
Definition llama_cpp_backend.cpp:930

entropic::LlamaCppBackend::create_sampler
std::unique_ptr< Sampler > create_sampler(const GenerationParams &params) const
Build a Sampler for one generation from params.
Definition llama_cpp_backend.cpp:1468

entropic::LlamaCppBackend::llama_model_ptr
llama_model * llama_model_ptr()
Get the loaded llama_model pointer.
Definition llama_cpp_backend.h:210

entropic::LlamaCppBackend::do_count_tokens
int do_count_tokens(const std::string &text) const override
Count tokens in text.
Definition llama_cpp_backend.cpp:825

entropic::LlamaCppBackend::mtmd_ctx_
::mtmd_context * mtmd_ctx_
libmtmd context, or nullptr if no mmproj loaded.
Definition llama_cpp_backend.h:1087

entropic::LlamaCppBackend::last_parser_
std::string last_parser_
Captured serialized PEG arena.
Definition llama_cpp_backend.h:684

entropic::LlamaCppBackend::generate_multimodal
GenerationResult generate_multimodal(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel)
Multimodal generation core (v1.9.11 Phases 5–7).
Definition llama_cpp_backend.cpp:2484

entropic::LlamaCppBackend::seq_id_mutex_
std::mutex seq_id_mutex_
Guards temp seq_id pool (v1.9.10)
Definition llama_cpp_backend.h:1053

entropic::LlamaCppBackend::do_clear_state
bool do_clear_state(int seq_id) override
Clear KV cache or recurrent hidden state.
Definition llama_cpp_backend.cpp:4060

entropic::LlamaCppBackend::prefill_shared_and_fanout
bool prefill_shared_and_fanout(std::vector< BatchSeq > &seqs, const std::vector< llama_token > &seq0, std::size_t shared)
Prefill shared prefix into seq 0 + seq_cp fan-out.
Definition llama_cpp_backend.cpp:1698

entropic::LlamaCppBackend::apply_chat_template_lowlevel
std::string apply_chat_template_lowlevel(const std::vector< Message > &messages) const
Low-level GGUF template path (gh#86 fallback, v2.6.1).
Definition llama_cpp_backend.cpp:1420

entropic::LlamaCppBackend::extract_token_logprob
static float extract_token_logprob(const float *logits, int32_t next_token, int n_vocab)
Extract log-probability for a token from logits.
Definition llama_cpp_backend.cpp:948

entropic::LlamaCppBackend::do_deactivate
void do_deactivate() override
Deactivate: free context, reload model CPU-only.
Definition llama_cpp_backend.cpp:618

entropic::LlamaCppBackend::generate_mtp
GenerationResult generate_mtp(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel, const std::string &head_path, int n_max)
Speculative generation via a target-owned MTP head (gh#106).
Definition llama_cpp_backend.cpp:3906

entropic::LlamaCppBackend::last_prefill_tokens
int last_prefill_tokens() const
Prompt (prefill) tokens actually decoded by the last generation.
Definition llama_cpp_backend.h:236

entropic::LlamaCppBackend::do_info
BackendInfo do_info() const override
Populate backend metadata from llama.cpp model.
Definition llama_cpp_backend.cpp:4026

entropic::LlamaCppBackend::do_generate_batch
std::vector< GenerationResult > do_generate_batch(const std::vector< std::vector< Message > > &requests, const std::vector< GenerationParams > &params, std::atomic< bool > &cancel) override
Same-prefix batch generation (gh#98, v2.8.0).
Definition llama_cpp_backend.cpp:1903

entropic::LlamaCppBackend::parse_parser_
std::string parse_parser_
Last TOOLED render's PEG arena.
Definition llama_cpp_backend.h:694

entropic::LlamaCppBackend::do_activate
bool do_activate() override
Activate model on GPU (WARM → ACTIVE).
Definition llama_cpp_backend.cpp:378

entropic::LlamaCppBackend::prepare_batch_seqs
bool prepare_batch_seqs(std::vector< BatchSeq > &seqs, const std::vector< GenerationParams > &params)
Build per-request sampler chains + seq ids.
Definition llama_cpp_backend.cpp:1679

entropic::LlamaCppBackend::build_mtp_head
bool build_mtp_head(const std::string &head_path)
Load the MTP head GGUF + create its shared-KV context (gh#106).
Definition llama_cpp_backend.cpp:568

entropic::LlamaCppBackend::prefill_and_cache_prefix
bool prefill_and_cache_prefix(const std::vector< llama_token > &tokens, int prefix_tokens, const CacheKey &key)
Two-pass prefill: prefix-only prefill → save → rest.
Definition llama_cpp_backend.cpp:2091

entropic::LlamaCppBackend::mtp_mutex_
std::mutex mtp_mutex_
gh#108: serialises MTP head setup/teardown vs in-flight generate_mtp (no deactivate-during-generate U...
Definition llama_cpp_backend.h:647

entropic::LlamaCppBackend::allocate_temp_seq_id
llama_seq_id allocate_temp_seq_id()
Allocate a temporary sequence ID for evaluation.
Definition llama_cpp_backend.cpp:914

entropic::LlamaCppBackend::prompt_cache_config_
PromptCacheConfig prompt_cache_config_
Cache config (v1.8.3)
Definition llama_cpp_backend.h:673

entropic::LlamaCppBackend::parse_chat_format_
int parse_chat_format_
Last TOOLED render's format.
Definition llama_cpp_backend.h:692

entropic::LlamaCppBackend::resident_tokens_
std::vector< llama_token > resident_tokens_
gh#96: tokens resident in KV seq 0 (warm-keep)
Definition llama_cpp_backend.h:640

entropic::LlamaCppBackend::mtp_draft_model_
llama_model * mtp_draft_model_
MTP head GGUF (separate, trunk-sharing)
Definition llama_cpp_backend.h:643

entropic::LlamaCppBackend::do_unload
void do_unload() override
Full unload — free all resources, clear prompt cache.
Definition llama_cpp_backend.cpp:744

entropic::LlamaCppBackend::model_
llama_model * model_
Loaded model (WARM+)
Definition llama_cpp_backend.h:633

entropic::LlamaCppBackend::~LlamaCppBackend
~LlamaCppBackend() override
Free llama.cpp + mtmd resources on destruction.
Definition llama_cpp_backend.cpp:695

entropic::LlamaCppBackend::invalidate_resident_kv
void invalidate_resident_kv()
gh#96 (v2.7.5): drop the warm-keep resident-KV record.
Definition llama_cpp_backend.cpp:2238

entropic::LlamaCppBackend::free_seq_ids_
std::vector< llama_seq_id > free_seq_ids_
Available temporary seq_ids (v1.9.10)
Definition llama_cpp_backend.h:1054

entropic::LlamaCppBackend::do_generate_speculative
GenerationResult do_generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
Speculative streaming via the abstract InferenceBackend interface (kept as NOT_SUPPORTED — see kernel...
Definition llama_cpp_backend.cpp:2778

entropic::LlamaCppBackend::do_generate_streaming_text_only
GenerationResult do_generate_streaming_text_only(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Text-only streaming generation (extracted from streaming).
Definition llama_cpp_backend.cpp:2720

entropic::LlamaCppBackend::extract_system_prompt
static std::string extract_system_prompt(const std::vector< Message > &messages)
Extract the system prompt from messages.
Definition llama_cpp_backend.cpp:1934

entropic::LlamaCppBackend::allocate_temp_seq_id_for_test
llama_seq_id allocate_temp_seq_id_for_test()
Allocate a temp seq_id (test-only seam for gh#98).
Definition llama_cpp_backend.h:151

entropic::LlamaCppBackend::mtp_draft_ctx_
llama_context * mtp_draft_ctx_
MTP context (ctx_type=MTP, ctx_other=ctx_)
Definition llama_cpp_backend.h:644

entropic::LlamaCppBackend::inject_sampler_factory_for_test
void inject_sampler_factory_for_test(std::unique_ptr< SamplerFactory > factory)
Inject a SamplerFactory for unit testing (v2.3.10).
Definition llama_cpp_backend.cpp:733

entropic::LlamaCppBackend::last_gen_decode_calls
int last_gen_decode_calls() const
Number of batched generation decodes in the last gh#98 batch.
Definition llama_cpp_backend.h:247

entropic::LlamaCppBackend::setup_mtp_draft
bool setup_mtp_draft(const std::string &head_path, int n_max)
Lazily build the MTP head context against the live ctx_ (gh#106).
Definition llama_cpp_backend.cpp:554

entropic::SamplerFactory
Factory that materializes a Sampler from GenerationParams.
Definition sampler.h:93

entropic::Sampler
Pure-virtual per-generation sampler used by the decode loop.
Definition sampler.h:48

entropic_error_t
entropic_error_t
Error codes returned by all C API functions.
Definition error.h:35

backend.h
InferenceBackend concrete base class.

entropic
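Namespace containing LlamaCppBackend and its supporting inference types (Sampler, SamplerFactory, GenerationParams, GenerationResult, ...).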
Definition bundled_models.h:20

entropic::BackendCapability
BackendCapability
Capabilities that an inference backend may or may not support.
Definition backend_capability.h:33

entropic::BudgetMode::tokens
tokens
Gate on generated tokens since the last tool call.

prompt_cache.h
Host-memory KV cache state storage with LRU eviction.

sampler.h
Abstract Sampler seam for backend testability (v2.3.10).

entropic::BackendInfo
Backend metadata for introspection.
Definition backend_capability.h:58

entropic::CacheEntry
Single cached KV state snapshot.
Definition prompt_cache.h:62

entropic::CacheKey
64-bit hash used as cache lookup key.
Definition prompt_cache.h:38

entropic::GenerationParams
Generation parameters for a single inference call.
Definition config.h:302

entropic::GenerationResult
Result of a single generation call.
Definition generation_result.h:30

entropic::LlamaCppBackend::BatchSeq
Per-sequence state for the gh#98 multi-seq batched decode.
Definition llama_cpp_backend.h:806

entropic::LlamaCppBackend::BatchSeq::pos
int pos
Next KV position to write.
Definition llama_cpp_backend.h:810

entropic::LlamaCppBackend::BatchSeq::finish
std::string finish
Finish reason.
Definition llama_cpp_backend.h:816

entropic::LlamaCppBackend::BatchSeq::logits_idx
int logits_idx
Batch cell holding current logits.
Definition llama_cpp_backend.h:811

entropic::LlamaCppBackend::BatchSeq::out
std::vector< llama_token > out
Generated tokens.
Definition llama_cpp_backend.h:815

entropic::LlamaCppBackend::BatchSeq::seq_id
llama_seq_id seq_id
KV sequence id.
Definition llama_cpp_backend.h:809

entropic::LlamaCppBackend::BatchSeq::chain
llama_sampler * chain
Borrowed native chain (sampled per-idx)
Definition llama_cpp_backend.h:808

entropic::LlamaCppBackend::BatchSeq::max_tokens
int max_tokens
Per-request generation cap.
Definition llama_cpp_backend.h:813

entropic::LlamaCppBackend::BatchSeq::n_gen
int n_gen
Tokens generated so far.
Definition llama_cpp_backend.h:812

entropic::LlamaCppBackend::BatchSeq::sampler
std::unique_ptr< Sampler > sampler
Owns the per-request chain.
Definition llama_cpp_backend.h:807

entropic::LlamaCppBackend::BatchSeq::active
bool active
Still generating?
Definition llama_cpp_backend.h:814

entropic::LlamaCppBackend::CommonChatResult
Result of a common_chat parse: native tool calls + split content.
Definition llama_cpp_backend.h:304

entropic::LlamaCppBackend::CommonChatResult::tool_calls
std::vector< ToolCall > tool_calls
Extracted native tool calls.
Definition llama_cpp_backend.h:305

entropic::LlamaCppBackend::CommonChatResult::content
std::string content
Content with calls + reasoning removed.
Definition llama_cpp_backend.h:306

entropic::LlamaCppBackend::CommonChatResult::reasoning_content
std::string reasoning_content
Extracted reasoning/thought block.
Definition llama_cpp_backend.h:307

entropic::LogprobResult
Per-token log-probability evaluation result.
Definition logprob_result.h:36

entropic::ModelConfig
Model configuration for a single tier.
Definition config.h:148

entropic::PromptCacheConfig
Prompt caching configuration.
Definition config.h:266

tokenizer.h
Abstract Tokenizer seam for backend testability (v2.3.10).