LlamaCppBackend — common llama.cpp patterns (15% layer). More...

#include </home/runner/work/entropic/entropic/src/inference/llama_cpp_backend.h>

Inheritance diagram for entropic::LlamaCppBackend:

Collaboration diagram for entropic::LlamaCppBackend:

[legend]

Classes
struct	BatchSeq
	Per-sequence state for the gh#98 multi-seq batched decode. More...

struct	CommonChatResult
	Result of a common_chat parse: native tool calls + split content. More...

Public Member Functions
	~LlamaCppBackend () override
	Free llama.cpp + mtmd resources on destruction.

void	inject_tokenizer_for_test (std::unique_ptr< Tokenizer > tokenizer)
	Inject a tokenizer for unit testing (v2.3.10).

void	inject_sampler_factory_for_test (std::unique_ptr< SamplerFactory > factory)
	Inject a SamplerFactory for unit testing (v2.3.10).

SamplerFactory *	sampler_factory_for_test () const
	Read the currently-wired SamplerFactory (test-only).

llama_seq_id	allocate_temp_seq_id_for_test ()
	Allocate a temp seq_id (test-only seam for gh#98).

void	release_temp_seq_id_for_test (llama_seq_id id)
	Release a temp seq_id (test-only seam, gh#98).

void	set_prompt_cache_config (const PromptCacheConfig &config)
	Set prompt cache configuration.

void	clear_prompt_cache () override
	Drop every cached prefix so the next prefill re-seeds.

std::vector< int32_t >	tokenize_text (const std::string &text) const override
	Tokenize text to token IDs using model vocabulary.

llama_model *	llama_model_ptr ()
	Get the loaded llama_model pointer.

llama_context *	llama_context_ptr ()
	Get the active llama_context pointer.

int	last_prefill_tokens () const
	Prompt (prefill) tokens actually decoded by the last generation.

int	last_gen_decode_calls () const
	Number of batched generation decodes in the last gh#98 batch.

double	last_prefill_ms () const
	Wall-clock milliseconds spent in prefill by the last generation.

int	last_input_tokens () const
	Tokenized prompt size of the last generation (input tokens).

int	kv_pos_max () const
	Highest occupied KV position in seq 0 right now (live query).

void	set_active_tools (const std::string &tools_json)
	Stage tool definitions for the next common_chat render (gh#87).

std::string	render_with_tools (const std::vector< Message > &messages, const GenerationParams &params)
	Render messages through common_chat WITH the active tools.

CommonChatResult	parse_response (const std::string &raw) const
	Parse a raw model emission via the last captured render params.

bool	has_common_chat_params () const
	True iff the last render captured common_chat parse params (gh#87).

bool	common_chat_parse_reliable () const
	True iff common_chat parsing is reliable for the last render (gh#87).

std::string	tool_call_close_marker () const override
	Tool-call close marker for the captured chat format (gh#103).

std::vector< std::string >	effective_stop (const GenerationParams &params) const
	params.stop + the sequential tool-call close marker, if applicable.

GenerationResult	generate_speculative_with_draft (const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel, LlamaCppBackend &draft, int n_draft_max, const std::string &draft_path)
	Speculative-decoding kernel with explicit draft backend.

GenerationResult	generate_mtp (const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel, const std::string &head_path, int n_max)
	Speculative generation via a target-owned MTP head (gh#106).

bool	mtp_active () const
	True when an MTP head context is live against the current ctx_.

Public Member Functions inherited from entropic::InferenceBackend
bool	load (const ModelConfig &config)
	Load model into CPU RAM (COLD → WARM).

bool	activate ()
	Promote to GPU (WARM → ACTIVE).

void	deactivate ()
	Release GPU layers (ACTIVE → WARM).

void	unload ()
	Full unload (→ COLD).

bool	load_and_activate (const ModelConfig &config)
	Convenience: load() + activate().

GenerationResult	generate (const std::vector< Message > &messages, const GenerationParams &params)
	Generate a complete response.

GenerationResult	generate (const std::vector< Message > &messages, const GenerationParams &params, std::atomic< bool > &cancel)
	Generate a complete response with cancellation support.

GenerationResult	generate_streaming (const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
	Generate with per-token streaming callback.

GenerationResult	generate_speculative (const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
	Generate via the speculative-decoding kernel (v2.1.11).

GenerationResult	complete (const std::string &prompt, const GenerationParams &params)
	Raw text completion without chat template.

LogprobResult	evaluate_logprobs (const int32_t *tokens, int n_tokens)
	Evaluate per-token log-probabilities for a token sequence.

float	compute_perplexity (const int32_t *tokens, int n_tokens)
	Compute perplexity for a token sequence.

ModelState	state () const
	Current lifecycle state (lock-free read).

bool	is_active () const
	True when state is ACTIVE.

bool	is_loaded () const
	True when state is WARM or ACTIVE.

int	count_tokens (const std::string &text) const
	Count tokens using model's tokenizer.

int	context_length () const
	Model's context window size.

const ModelConfig &	config () const
	Stored model config.

bool	supports (BackendCapability cap) const
	Query whether this backend supports a capability.

std::vector< BackendCapability >	capabilities () const
	Get all supported capabilities as a vector.

BackendInfo	info () const
	Get backend metadata.

bool	save_state (int seq_id, std::vector< uint8_t > &buffer) const
	Save model state to buffer.

bool	restore_state (int seq_id, const std::vector< uint8_t > &buffer)
	Restore model state from buffer.

bool	clear_state (int seq_id=-1)
	Clear/reset model state for a sequence.

GenerationResult	generate_seq (int seq_id, const std::vector< Message > &messages, const GenerationParams &params)
	Generate with explicit sequence ID.

GenerationResult	generate_streaming_seq (int seq_id, const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
	Streaming generation with explicit sequence ID.

std::vector< GenerationResult >	generate_batch (const std::vector< std::vector< Message > > &requests, const std::vector< GenerationParams > &params, std::atomic< bool > &cancel)
	Generate N independent same-prefix requests together.

Protected Member Functions
bool	do_load (const ModelConfig &config) override
	Load model into CPU RAM (COLD → WARM).

bool	do_activate () override
	Activate model on GPU (WARM → ACTIVE).

void	do_deactivate () override
	Deactivate: free context, reload model CPU-only.

void	do_unload () override
	Full unload — free all resources, clear prompt cache.

GenerationResult	do_generate (const std::vector< Message > &messages, const GenerationParams &params) override
	Generate a complete response using chat template.

GenerationResult	do_generate (const std::vector< Message > &messages, const GenerationParams &params, std::atomic< bool > &cancel) override
	Batch generate with per-token cancel poll.

GenerationResult	do_generate_streaming (const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
	Streaming generation with per-token callback.

GenerationResult	do_generate_speculative (const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
	Speculative streaming via the abstract InferenceBackend interface (kept as NOT_SUPPORTED — see kernel entry below).

std::vector< GenerationResult >	do_generate_batch (const std::vector< std::vector< Message > > &requests, const std::vector< GenerationParams > &params, std::atomic< bool > &cancel) override
	Same-prefix batch generation (gh#98, v2.8.0).

GenerationResult	do_complete (const std::string &prompt, const GenerationParams &params) override
	Raw text completion without chat template.

int	do_count_tokens (const std::string &text) const override
	Count tokens in text.

LogprobResult	do_evaluate_logprobs (const int32_t *tokens, int n_tokens) override
	Evaluate per-token log-probabilities via sequential decode.

bool	do_supports (BackendCapability cap) const override
	Declare llama.cpp backend capabilities.

std::string	do_backend_name () const override
	Return backend name.

BackendInfo	do_info () const override
	Populate backend metadata from llama.cpp model.

bool	do_clear_state (int seq_id) override
	Clear KV cache or recurrent hidden state.

bool	do_save_state (int seq_id, std::vector< uint8_t > &buffer) const override
	Capture a sequence's KV cache into a byte buffer.

bool	do_restore_state (int seq_id, const std::vector< uint8_t > &buffer) override
	Restore a sequence's KV cache from a byte buffer.

std::vector< llama_token >	tokenize (const std::string &text, bool add_special) const
	Tokenize text using model vocabulary.

std::string	detokenize (llama_token token) const
	Detokenize a single token.

std::string	apply_chat_template (const std::vector< Message > &messages, const GenerationParams &params) const
	Apply chat template to messages.

std::string	render_prompt (const std::vector< Message > &messages, const GenerationParams &params)
	Generation render seam: common_chat-with-tools or legacy (gh#87).

std::string	apply_chat_template_lowlevel (const std::vector< Message > &messages) const
	Low-level GGUF template path (gh#86 fallback, v2.6.1).

GenerationResult	decode_loop (const std::vector< llama_token > &tokens, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > *cancel)
	Core decode loop — shared by generate and streaming.

GenerationResult	generate_after_prefill (Sampler &sampler, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > *cancel)
	The post-prefill sampling loop (extracted from decode_loop).

std::vector< GenerationResult >	run_batched_decode (const std::vector< std::vector< llama_token > > &toks, const std::vector< GenerationParams > &params, std::size_t shared, std::atomic< bool > &cancel)
	Run the gh#98 multi-seq batched decode (v2.8.0).

bool	prepare_batch_seqs (std::vector< BatchSeq > &seqs, const std::vector< GenerationParams > &params)
	Build per-request sampler chains + seq ids.

bool	prefill_shared_and_fanout (std::vector< BatchSeq > &seqs, const std::vector< llama_token > &seq0, std::size_t shared)
	Prefill shared prefix into seq 0 + seq_cp fan-out.

bool	prefill_batch_suffixes (std::vector< BatchSeq > &seqs, const std::vector< std::vector< llama_token > > &toks, std::size_t shared)
	Prefill each request's suffix; set per-seq logits_idx.

void	run_batch_gen_loop (std::vector< BatchSeq > &seqs, int max_steps, std::atomic< bool > &cancel)
	Decode all sequences together until each finishes.

void	sample_batch_active (std::vector< BatchSeq > &seqs)
	Sample+accept+classify each still-active sequence.

std::vector< GenerationResult >	build_batch_results (std::vector< BatchSeq > &seqs)
	Detokenize each sequence into a GenerationResult.

void	release_temp_seqs (std::vector< BatchSeq > &seqs)
	Release every batch sequence's temp seq_id (seq 0 excluded).

bool	run_prefill (const std::vector< llama_token > &tokens)
	Run batched prefill on input tokens.

std::string	step_token (Sampler &sampler, std::string &generated, std::function< void(std::string_view)> &on_token, const std::vector< std::string > &stop)
	Generate one token and append to output.

std::unique_ptr< Sampler >	create_sampler (const GenerationParams &params) const
	Build a Sampler for one generation from params.

bool	run_prefill_cached (const std::vector< llama_token > &tokens, const std::string &system_prompt, const std::vector< Message > &messages, const GenerationParams &params)
	Run prefill with prompt cache integration.

bool	prefill_dispatch (const std::vector< llama_token > &tokens, const std::string &system_prompt, const std::vector< Message > &messages, const GenerationParams &params)
	Cache-aware prefill dispatch (gh#96 v2.7.5: extracted body of run_prefill_cached so the wrapper owns the perf reset+capture).

bool	try_warm_reuse (const std::vector< llama_token > &tokens)
	gh#96 (v2.7.5): try incremental prefill against resident KV.

void	invalidate_resident_kv ()
	gh#96 (v2.7.5): drop the warm-keep resident-KV record.

bool	decode_tokens_from (const std::vector< llama_token > &tokens, int start_offset)
	Decode tokens starting at a given offset.

bool	restore_cached_prefix (const CacheEntry *cached, const std::vector< llama_token > &tokens)
	Restore KV state from cache and decode remaining tokens.

bool	prefill_and_cache_prefix (const std::vector< llama_token > &tokens, int prefix_tokens, const CacheKey &key)
	Two-pass prefill: prefix-only prefill → save → rest.

void	save_prefix_to_cache (const CacheKey &key, int prefix_tokens)
	Capture seq 0 KV state and store under the given key.

int	compute_prefix_token_count (const std::vector< Message > &messages, const GenerationParams &params)
	Compute token count of system messages only.

llama_seq_id	allocate_temp_seq_id ()
	Allocate a temporary sequence ID for evaluation.

void	release_temp_seq_id (llama_seq_id seq_id)
	Release a temporary sequence ID back to the pool.

bool	is_recurrent () const
	Check if loaded model is recurrent.

GenerationResult	generate_multimodal (const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel)
	Multimodal generation core (v1.9.11 Phases 5–7).

void	init_mmproj_if_configured ()
	Initialize the libmtmd context if mmproj is configured.

bool	load_gpu_model ()
	Load the GGUF model onto the GPU (do_activate step 1).

bool	create_inference_context ()
	Create the llama context + prompt cache (do_activate step 2).

bool	setup_mtp_draft (const std::string &head_path, int n_max)
	Lazily build the MTP head context against the live ctx_ (gh#106).

bool	build_mtp_head (const std::string &head_path)
	Load the MTP head GGUF + create its shared-KV context (gh#106).

void	teardown_mtp_draft ()
	Free the MTP head context + model (gh#106 lifecycle).

void	reload_model_cpu_only ()
	Reload the model CPU-only for the WARM state (do_deactivate tail).

GenerationResult	mtp_guard (const GenerationParams &params, const std::function< void(std::string_view)> &on_token, const std::string &head_path, int n_max)
	Validate MTP run preconditions (gh#108, fail-fast/fail-loud).

entropic_error_t	mtmd_prefill (const std::string &prompt, const std::vector<::mtmd_bitmap * > &bitmaps, std::string &err_msg)
	Run mtmd_tokenize + mtmd_helper_eval_chunks on a prompt.

GenerationResult	run_sampling_loop (const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel, const std::chrono::steady_clock::time_point &t0)
	Sample tokens until stop / max_tokens / cancel.

GenerationResult	do_generate_text_only (const std::vector< Message > &messages, const GenerationParams &params)
	Text-only batch generation (extracted from do_generate).

GenerationResult	do_generate_text_only (const std::vector< Message > &messages, const GenerationParams &params, std::atomic< bool > &cancel)
	Text-only batch generation with per-token cancel poll.

GenerationResult	do_generate_streaming_text_only (const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
	Text-only streaming generation (extracted from streaming).

Protected Member Functions inherited from entropic::InferenceBackend
virtual GenerationResult	do_generate_seq (int seq_id, const std::vector< Message > &messages, const GenerationParams &params)
	Generate with sequence ID.

virtual GenerationResult	do_generate_streaming_seq (int seq_id, const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
	Streaming generation with sequence ID.

bool	fire_model_load_hook (const ModelConfig &config)
	Fire ON_MODEL_LOAD pre-hook.

void	set_hooks (const HookInterface &hooks)
	Set the hook dispatch interface.

Static Protected Member Functions
static std::string	extract_system_prompt (const std::vector< Message > &messages)
	Extract the system prompt from messages.

static float	extract_token_logprob (const float *logits, int32_t next_token, int n_vocab)
	Extract log-probability for a token from logits.

Protected Attributes
llama_model *	model_ = nullptr
	Loaded model (WARM+)

llama_context *	ctx_ = nullptr
	Inference context (ACTIVE)

const llama_vocab *	vocab_ = nullptr
	Vocabulary (from model_)

int	last_prefill_tokens_ = 0
	gh#96: prompt tokens decoded by last generate()

int	last_gen_decode_calls_ = 0
	gh#98: batched-decode step count of last batch

int	last_input_tokens_ = 0
	gh#97: tokenized prompt size of last generate()

double	last_prefill_ms_ = 0.0
	gh#96: prefill wall-clock ms of last generate()

std::vector< llama_token >	resident_tokens_
	gh#96: tokens resident in KV seq 0 (warm-keep)

llama_model *	mtp_draft_model_ = nullptr
	MTP head GGUF (separate, trunk-sharing)

llama_context *	mtp_draft_ctx_ = nullptr
	MTP context (ctx_type=MTP, ctx_other=ctx_)

std::string	mtp_head_path_
	Path the live mtp_draft_ctx_ was built from.

int	mtp_n_max_ = 16
	MTP draft window (n_max) of the live head.

std::mutex	mtp_mutex_
	gh#108: serialises MTP head setup/teardown vs in-flight generate_mtp (no deactivate-during-generate UAF)

std::unique_ptr< Tokenizer >	tokenizer_
	Tokenizer used by tokenize_text / do_count_tokens / internal tokenize/detokenize.

std::unique_ptr< SamplerFactory >	sampler_factory_
	Factory used by the decode loop to build per-generation samplers.

PromptCacheConfig	prompt_cache_config_
	Cache config (v1.8.3)

std::unique_ptr< PromptCache >	prompt_cache_
	KV prefix cache (v1.8.3)

std::string	active_tools_json_
	MCP tool defs for next render.

int	last_chat_format_ = 0
	Captured common_chat_format.

std::string	last_generation_prompt_
	Captured generation_prompt.

std::string	last_parser_
	Captured serialized PEG arena.

bool	have_chat_params_ = false
	True once a tool render captured params.

int	parse_chat_format_ = 0
	Last TOOLED render's format.

std::string	parse_generation_prompt_
	Last TOOLED render's gen prompt.

std::string	parse_parser_
	Last TOOLED render's PEG arena.

bool	parse_params_valid_ = false
	True once a tooled render snapshotted.

std::mutex	seq_id_mutex_
	Guards temp seq_id pool (v1.9.10)

std::vector< llama_seq_id >	free_seq_ids_
	Available temporary seq_ids (v1.9.10)

llama_seq_id	next_temp_seq_id_ = 1
	gh#98: monotonic high-water for NEW temp seq_ids (the old `1 + size()` handed out duplicates when the pool was empty — every call returned 1 — colliding the batch's sequences).

bool	is_recurrent_ = false
	True if loaded model is recurrent (GDN/Mamba/RWKV).

bool	is_hybrid_ = false
	gh#97: attention + recurrent/SSM memory

::mtmd_context *	mtmd_ctx_ = nullptr
	libmtmd context, or nullptr if no mmproj loaded.

bool	has_vision_ = false
	Cached `mtmd_support_vision(mtmd_ctx_)` result.

Protected Attributes inherited from entropic::InferenceBackend
std::string	last_error_
	Last error message for diagnostics.

std::atomic< ModelState >	state_ {ModelState::COLD}
	State transition slot accessible to subclasses for test-only injection.

Detailed Description

LlamaCppBackend — common llama.cpp patterns (15% layer).

Provides decode loop, sampler chain creation, tokenization helpers. Pinned-version subclass overrides do_load/do_activate with version-specific API calls.

Version: 1.8.3

Definition at line 65 of file llama_cpp_backend.h.

Constructor & Destructor Documentation

◆ ~LlamaCppBackend()

entropic::LlamaCppBackend::~LlamaCppBackend ( )

override

Free llama.cpp + mtmd resources on destruction.

Destructor — route to do_unload() so GPU buffers don't leak.

gh#58 v2.2.7 follow-up: the base InferenceBackend destructor is defaulted, so without this override the raw model_ / ctx_ / mtmd_ctx_ pointers leak when the backend goes out of scope. GPU buffers held by those objects stay allocated for the remainder of the process, and the next handle's GPU model load fails because llama.cpp's CUDA pool sees the prior allocations as stale-but-occupied.

@utility

Version: 2.2.8

gh#58 v2.2.7 follow-up: previously the defaulted base destructor left model_/ctx_/mtmd_ctx_ as raw pointers that were never freed. On a second handle's GPU model load, llama.cpp's CUDA pool then failed because the prior buffers were still allocated.

Definition at line 695 of file llama_cpp_backend.cpp.

Member Function Documentation

◆ allocate_temp_seq_id()

llama_seq_id entropic::LlamaCppBackend::allocate_temp_seq_id ( )

protected

Allocate a temporary sequence ID for evaluation.

Returns: Unused seq_id, or -1 if pool is exhausted.

Version: 1.9.10

Reuses a released id from the pool, else mints a fresh monotonic id. gh#98 (v2.8.0): the old 1 + free_seq_ids_.size() returned 1 on EVERY empty-pool call, so a multi-seq batch (which allocates N ids back-to-back with no intervening release) got duplicate ids → seq_id collision → shared KV slots. The monotonic counter mints distinct ids; released ids still return to the pool and are reused first, so it stays bounded by concurrent demand (≤ n_parallel for a valid batch).

Returns: Unused seq_id (starts at 1, 0 is generation).

Definition at line 914 of file llama_cpp_backend.cpp.

◆ allocate_temp_seq_id_for_test()

llama_seq_id entropic::LlamaCppBackend::allocate_temp_seq_id_for_test ( )

inline

Allocate a temp seq_id (test-only seam for gh#98).

Exposes allocate_temp_seq_id so a CPU unit test can assert the pool hands out DISTINCT ids on an empty pool — the invariant the gh#98 multi-seq batch relies on (the model test can't catch a collision because a forcing grammar makes output independent of the corrupted KV).

Returns: A temp seq_id. @utility

Version: 2.8.0

Definition at line 151 of file llama_cpp_backend.h.

◆ apply_chat_template()

std::string entropic::LlamaCppBackend::apply_chat_template	(	const std::vector< Message > &	messages,
		const GenerationParams &	params
	)		const

protected

Apply chat template to messages.

Render the GGUF chat template for messages (gh#86/gh#87).

Parameters

messages	Conversation history.
params	Generation parameters (for enable_thinking).

Returns: Formatted prompt string.

Version: 2.6.1

The legacy (tool-less) render path: delegates to the shared render_common_chat core with no tools, so the template's enable_thinking variable receives params.enable_thinking (the low-level llama_chat_apply_template() had no thinking slot — its bool arg is add_generation_prompt — so a tier's enable_thinking: false was silently dropped pre-v2.6.1). Tools are intentionally absent here; the gh#87 render_with_tools path supplies them via inputs.tools.

Falls back to the low-level template (then a plain join) if the jinja path is unavailable (e.g. a GGUF with no embedded template).

Parameters

messages	Conversation history.
params	Generation parameters (enable_thinking honored).

Returns: Formatted prompt string.

Definition at line 1154 of file llama_cpp_backend.cpp.

◆ apply_chat_template_lowlevel()

std::string entropic::LlamaCppBackend::apply_chat_template_lowlevel ( const std::vector< Message > & messages ) const

protected

Low-level GGUF template path (gh#86 fallback, v2.6.1).

Pre-v2.6.1 low-level template path (fallback for gh#86).

Parameters

messages Conversation history.

Returns: Formatted prompt string, or a plain join on failure.

Version: 2.6.1

Reads the GGUF template via llama_chat_apply_template(). Retained as the fallback when the jinja path is unavailable. Does NOT honor enable_thinking (the low-level API has no slot for it).

Parameters

messages Conversation history.

Returns: Formatted prompt string, or a plain join on failure.

Definition at line 1420 of file llama_cpp_backend.cpp.

◆ build_batch_results()

std::vector< GenerationResult > entropic::LlamaCppBackend::build_batch_results ( std::vector< BatchSeq > & seqs )

protected

Detokenize each sequence into a GenerationResult.

Detokenize each sequence into a GenerationResult (gh#98).

Version: 2.8.0

Definition at line 1813 of file llama_cpp_backend.cpp.

◆ build_mtp_head()

bool entropic::LlamaCppBackend::build_mtp_head ( const std::string & head_path )

protected

Load the MTP head GGUF + create its shared-KV context (gh#106).

Helper extracted from setup_mtp_draft to keep both under the knots returns gate. On success sets mtp_draft_model_/ctx_ + mtp_head_path_; on failure sets last_error_ and tears down any partial state.

Definition at line 568 of file llama_cpp_backend.cpp.

◆ clear_prompt_cache()

void entropic::LlamaCppBackend::clear_prompt_cache ( )

inlineoverridevirtual

Drop every cached prefix so the next prefill re-seeds.

Called by the orchestrator on identity/prompt-prefix changes. No-op when the cache has not been constructed yet. (P1-7, 2.0.6-rc16)

@utility

Version: 2.0.6-rc16

Reimplemented from entropic::InferenceBackend.

Definition at line 189 of file llama_cpp_backend.h.

◆ common_chat_parse_reliable()

bool entropic::LlamaCppBackend::common_chat_parse_reliable ( ) const

True iff common_chat parsing is reliable for the last render (gh#87).

True iff the captured common_chat format parses multi-param (gh#87).

common_chat's PEG autoparser (PEG_NATIVE / PEG_SIMPLE — inferred from a template) only extracts the FIRST <parameter=> of a multi-parameter tool call. Only the DEDICATED grammars (currently PEG_GEMMA4) parse multi-parameter calls correctly. So the orchestrator routes parsing to parse_response ONLY when the captured format is dedicated; autoparser families fall back to their hand-rolled multi-parameter adapter.

Extend the dedicated-format set here as llama.cpp gains hand-written parsers for more families.

Returns: true if parse_response is multi-parameter safe for this render. @utility

Version: 2.7.0

Only dedicated grammars (PEG_GEMMA4) are multi-parameter safe; the PEG autoparser (PEG_NATIVE/PEG_SIMPLE) drops parameters past the first. See header for the routing contract.

Returns: true if parse_response is multi-parameter safe. @utility

Version: 2.8.3

Definition at line 1251 of file llama_cpp_backend.cpp.

◆ compute_prefix_token_count()

int entropic::LlamaCppBackend::compute_prefix_token_count	(	const std::vector< Message > &	messages,
		const GenerationParams &	params
	)

protected

Compute token count of system messages only.

Compute system prefix token count from messages.

Parameters

messages	Message list.
params	Generation params (for template).

Returns: Token count, 0 if no system messages.

Version: 1.8.3

Parameters

messages	Original message list.
params	Generation params (for template).

Returns: Token count of the system prefix, 0 if no system message.

Definition at line 2051 of file llama_cpp_backend.cpp.

◆ create_inference_context()

bool entropic::LlamaCppBackend::create_inference_context ( )

protected

Create the llama context + prompt cache (do_activate step 2).

Extracted from do_activate. Builds ctx_ from model_ and the configured context params, then lazily creates the prompt cache.

Returns: true on success; sets last_error_ on failure.; true on success; sets last_error_ on failure.

Definition at line 453 of file llama_cpp_backend.cpp.

◆ create_sampler()

std::unique_ptr< Sampler > entropic::LlamaCppBackend::create_sampler ( const GenerationParams & params ) const

protected

Build a Sampler for one generation from params.

Build a Sampler for one generation via the v2.3.10 seam.

v2.3.10 seam: thin wrapper that delegates to sampler_factory_->create(params). Returns nullptr when no factory has been wired (COLD backend / production never activated). Callers must null-check before use.

Kept as a member function (vs inline at every call site) so the four legacy callers (decode_loop, run_sampling_loop, do_generate_text_only, do_generate_streaming_text_only) stay a one-liner — minimum diff to ship the seam.

Parameters

params Generation parameters.

Returns: Owned Sampler, or nullptr if no factory wired.

Version: 2.3.10

Pre-v2.3.10 this method built a llama_sampler* chain inline. v2.3.10 moves chain construction into LlamaCppSamplerFactory (see llama_cpp_sampler.cpp); this entry stays as a thin wrapper so the four legacy callers (decode_loop, run_sampling_loop, do_generate_text_only, do_generate_streaming_text_only) remain one-liners.

Returns nullptr when no factory has been wired — a guarded fallback for the COLD-state code path that exists for defensiveness (production callers only reach this from ACTIVE-only code paths, where do_activate has already installed the production factory).

Parameters

params Generation parameters.

Returns: Owned Sampler, or nullptr if no factory installed.

Definition at line 1468 of file llama_cpp_backend.cpp.

◆ decode_loop()

GenerationResult entropic::LlamaCppBackend::decode_loop	(	const std::vector< llama_token > &	tokens,
		const GenerationParams &	params,
		std::function< void(std::string_view)>	on_token,
		std::atomic< bool > *	cancel
	)

protected

Core decode loop — shared by generate and streaming.

Core decode loop shared by generate, streaming, and complete.

gh#98 (v2.8.0): the post-prefill sampling loop is now in generate_after_prefill (shared with the batch path); decode_loop is create_sampler + run_prefill + generate_after_prefill.

Parameters

tokens	Input token sequence.
params	Generation parameters.
on_token	Per-token callback (nullptr for batch).
cancel	Cancel flag (nullptr for batch).

Returns: GenerationResult.

Version: 2.8.0

gh#98 (v2.8.0): the post-prefill sampling loop is extracted to generate_after_prefill (shared with the batch path).

Parameters

tokens	Input token sequence.
params	Generation parameters.
on_token	Per-token callback (nullptr for batch).
cancel	Cancel flag (nullptr for batch).

Returns: GenerationResult.

Definition at line 1562 of file llama_cpp_backend.cpp.

◆ decode_tokens_from()

bool entropic::LlamaCppBackend::decode_tokens_from	(	const std::vector< llama_token > &	tokens,
		int	start_offset
	)

protected

Decode tokens starting at a given offset.

Decode remaining tokens starting at start_offset.

Parameters

tokens	Full token sequence.
start_offset	First token to decode.

Returns: true on success.

Version: 2.0.6

Assumes seq_pos_max(0) == start_offset - 1 so that llama_batch_get_one auto-positions tokens at start_offset onward.

Parameters

tokens	Full token sequence.
start_offset	Index of the first token to decode.

Returns: true on success, false on decode failure.

Definition at line 1957 of file llama_cpp_backend.cpp.

◆ detokenize()

std::string entropic::LlamaCppBackend::detokenize ( llama_token token ) const

protected

Detokenize a single token.

Detokenize a single token to string.

Parameters

token Token ID.

Returns: String representation.

Version: 1.8.2

Parameters

token Token ID.

Returns: String representation.

Definition at line 809 of file llama_cpp_backend.cpp.

◆ do_activate()

bool entropic::LlamaCppBackend::do_activate ( )

overrideprotectedvirtual

Activate model on GPU (WARM → ACTIVE).

Reloads model with n_gpu_layers from config, then creates inference context with KV cache. v2.2.7 (gh#61) wired cache_type_k/v. v2.2.8 (gh#58 follow-up) added the diagnostic-rich error message. v2.2.9 extracted the cparams builder to satisfy the SLOC gate.

Returns: true on success.

Implements entropic::InferenceBackend.

Definition at line 378 of file llama_cpp_backend.cpp.

◆ do_backend_name()

std::string entropic::LlamaCppBackend::do_backend_name ( ) const

overrideprotectedvirtual

Return backend name.

Returns: "llama.cpp".

Implements entropic::InferenceBackend.

Definition at line 4016 of file llama_cpp_backend.cpp.

◆ do_clear_state()

bool entropic::LlamaCppBackend::do_clear_state ( int seq_id )

overrideprotectedvirtual

Clear KV cache or recurrent hidden state.

Parameters

seq_id Sequence ID, or -1 for all sequences.

Returns: true on success.

Reimplemented from entropic::InferenceBackend.

Definition at line 4060 of file llama_cpp_backend.cpp.

◆ do_complete()

GenerationResult entropic::LlamaCppBackend::do_complete	(	const std::string &	prompt,
		const GenerationParams &	params
	)

overrideprotectedvirtual

Raw text completion without chat template.

Parameters

prompt	Raw prompt string.
params	Generation parameters.

Returns: GenerationResult.

Implements entropic::InferenceBackend.

Definition at line 3941 of file llama_cpp_backend.cpp.

◆ do_count_tokens()

int entropic::LlamaCppBackend::do_count_tokens ( const std::string & text ) const

overrideprotectedvirtual

Count tokens in text.

Parameters

text	Input text.

Returns: Token count.

Implements entropic::InferenceBackend.

Definition at line 825 of file llama_cpp_backend.cpp.

◆ do_deactivate()

void entropic::LlamaCppBackend::do_deactivate ( )

overrideprotectedvirtual

Deactivate: free context, reload model CPU-only.

gh#87 (v2.7.0): frees the GPU model BEFORE reloading CPU-only, the symmetric fix to load_gpu_model. Deactivate's whole purpose is to release VRAM and stay WARM, so freeing the GPU model first aligns with intent and avoids holding two full llama_model objects during the tier-switch reload. The old "keep GPU model on CPU-reload failure" fallback is dropped — it left VRAM pinned, contradicting the deactivation; a failed warm-reload now nulls the handle (recoverable via the next activate, which reloads from scratch).

Implements entropic::InferenceBackend.

Definition at line 618 of file llama_cpp_backend.cpp.

◆ do_evaluate_logprobs()

LogprobResult entropic::LlamaCppBackend::do_evaluate_logprobs	(	const int32_t *	tokens,
		int	n_tokens
	)

overrideprotectedvirtual

Evaluate per-token log-probabilities via sequential decode.

Clears memory, then processes tokens one at a time using the same decode path as generation. After each token, extracts logits for the next-token prediction. Compatible with recurrent/hybrid models that only support single-output-position batches.

Parameters

tokens	Token IDs to evaluate.
n_tokens	Number of tokens (minimum 2).

Returns: LogprobResult with per-transition logprobs and perplexity.

Implements entropic::InferenceBackend.

Definition at line 859 of file llama_cpp_backend.cpp.

◆ do_generate() [1/2]

GenerationResult entropic::LlamaCppBackend::do_generate	(	const std::vector< Message > &	messages,
		const GenerationParams &	params
	)

overrideprotectedvirtual

Generate a complete response using chat template.

v2.1.8 (gh#37 / v1.9.11 Phases 5–7): dispatches to generate_multimodal() when any message carries IMAGE content_parts AND the backend has vision (mmproj loaded). When images arrive but vision is not available, image parts are stripped with a warning and generation proceeds text-only.

Parameters

messages	Conversation history.
params	Generation parameters.

Returns: GenerationResult.

Implements entropic::InferenceBackend.

Definition at line 2535 of file llama_cpp_backend.cpp.

◆ do_generate() [2/2]

GenerationResult entropic::LlamaCppBackend::do_generate	(	const std::vector< Message > &	messages,
		const GenerationParams &	params,
		std::atomic< bool > &	cancel
	)

overrideprotectedvirtual

Batch generate with per-token cancel poll.

Batch generate dispatch with cancel support (gh#81, v2.4.2).

(gh#81, v2.4.2)

Version: 2.4.2

Vision branch falls through to the streaming multimodal kernel with a null on_token (it already polls cancel). Text-only branch goes through do_generate_text_only(messages, params, cancel).

Reimplemented from entropic::InferenceBackend.

Definition at line 2610 of file llama_cpp_backend.cpp.

◆ do_generate_batch()

std::vector< GenerationResult > entropic::LlamaCppBackend::do_generate_batch	(	const std::vector< std::vector< Message > > &	requests,
		const std::vector< GenerationParams > &	params,
		std::atomic< bool > &	cancel
	)

overrideprotectedvirtual

Same-prefix batch generation (gh#98, v2.8.0).

Same-prefix batch generation override (gh#98, v2.8.0).

On a plain-KV arch with a real shared prefix and enough sequence slots (batch_is_viable), prefills the shared prefix once into seq 0 and runs each request's suffix + generation off that prefix, seq_rm-resetting the tail between requests — the proven warm-keep mechanism, applied N times. Otherwise falls back to the serial base implementation. Each request is sampled under its own grammar (option A).

Version: 2.8.0

Tokenizes each request, computes the shared prefix, and takes the batch fast-path when batch_is_viable (plain KV, real shared prefix, fits the seq slots + decode batch); otherwise falls back to the serial base path. The serial fallback is always correct — it is the unmodified single-request path per request — so disjoint prompts, hybrid archs, and over-capacity batches degrade gracefully with no regression.

Parameters

requests	Per-request message lists.
params	Per-request generation params.
cancel	Cancel flag.

Returns: One result per request, in input order.

Reimplemented from entropic::InferenceBackend.

Definition at line 1903 of file llama_cpp_backend.cpp.

◆ do_generate_speculative()

GenerationResult entropic::LlamaCppBackend::do_generate_speculative	(	const std::vector< Message > &	messages,
		const GenerationParams &	params,
		std::function< void(std::string_view token)>	on_token,
		std::atomic< bool > &	cancel
	)

overrideprotectedvirtual

Speculative streaming via the abstract InferenceBackend interface (kept as NOT_SUPPORTED — see kernel entry below).

Abstract speculative entry point.

The actual draft-pair-aware kernel lives in generate_speculative_with_draft and is called by the orchestrator after it has resolved the draft backend. This abstract override exists for backends with implicit draft resolution; LlamaCppBackend requires an explicit draft handle.

Returns: GenerationResult with NOT_SUPPORTED.

Version: 2.1.11

LlamaCppBackend requires an explicit draft handle, so the abstract single-backend variant returns NOT_SUPPORTED. The orchestrator dynamic_casts both target and draft and calls generate_speculative_with_draft directly. (v2.1.11, gh#36)

Returns: GenerationResult with NOT_SUPPORTED.

Reimplemented from entropic::InferenceBackend.

Definition at line 2778 of file llama_cpp_backend.cpp.

◆ do_generate_streaming()

GenerationResult entropic::LlamaCppBackend::do_generate_streaming	(	const std::vector< Message > &	messages,
		const GenerationParams &	params,
		std::function< void(std::string_view token)>	on_token,
		std::atomic< bool > &	cancel
	)

overrideprotectedvirtual

Streaming generation with per-token callback.

Parameters

messages	Conversation history.
params	Generation parameters.
on_token	Per-token callback.
cancel	Atomic cancel flag.

Returns: GenerationResult.

Implements entropic::InferenceBackend.

Definition at line 2696 of file llama_cpp_backend.cpp.

◆ do_generate_streaming_text_only()

GenerationResult entropic::LlamaCppBackend::do_generate_streaming_text_only	(	const std::vector< Message > &	messages,
		const GenerationParams &	params,
		std::function< void(std::string_view token)>	on_token,
		std::atomic< bool > &	cancel
	)

protected

Text-only streaming generation (extracted from streaming).

Text-only streaming body (v2.1.8, extracted for knots SLOC).

@utility

Version: 2.1.8

Definition at line 2720 of file llama_cpp_backend.cpp.

◆ do_generate_text_only() [1/2]

GenerationResult entropic::LlamaCppBackend::do_generate_text_only	(	const std::vector< Message > &	messages,
		const GenerationParams &	params
	)

protected

Text-only batch generation (extracted from do_generate).

Text-only generate body (v2.1.8, extracted for knots SLOC).

@utility

Version: 2.1.8

Definition at line 2555 of file llama_cpp_backend.cpp.

◆ do_generate_text_only() [2/2]

GenerationResult entropic::LlamaCppBackend::do_generate_text_only	(	const std::vector< Message > &	messages,
		const GenerationParams &	params,
		std::atomic< bool > &	cancel
	)

protected

Text-only batch generation with per-token cancel poll.

Text-only batch generate with per-token cancel poll (gh#81, v2.4.2).

gh#81 (v2.4.2): mirrors the per-token cancel poll already in do_generate_streaming_text_only. When cancel is set mid-decode the loop breaks within one token; the result is tagged finish_reason="cancelled" and error_code=ENTROPIC_ERROR_CANCELLED.

@utility

Version: 2.4.2

Identical to the no-cancel overload above except for the cancel poll inside the decode loop. See its docs for the prefill / sampler contract.

Definition at line 2636 of file llama_cpp_backend.cpp.

◆ do_info()

BackendInfo entropic::LlamaCppBackend::do_info ( ) const

overrideprotectedvirtual

Populate backend metadata from llama.cpp model.

Returns: BackendInfo with model-specific details.

Reimplemented from entropic::InferenceBackend.

Definition at line 4026 of file llama_cpp_backend.cpp.

◆ do_load()

bool entropic::LlamaCppBackend::do_load ( const ModelConfig & config )

overrideprotectedvirtual

Load model into CPU RAM (COLD → WARM).

Uses llama_model_load_from_file with n_gpu_layers=0 for CPU-only mmap+mlock loading. Model stays in page cache for fast reactivation.

Parameters

config Validated model config.

Returns: true on success.

Implements entropic::InferenceBackend.

Definition at line 273 of file llama_cpp_backend.cpp.

◆ do_restore_state()

bool entropic::LlamaCppBackend::do_restore_state	(	int	seq_id,
		const std::vector< uint8_t > &	buffer
	)

overrideprotectedvirtual

Restore a sequence's KV cache from a byte buffer.

Restore one sequence's KV cache via llama_state_seq_set_data.

Wraps llama.cpp's llama_state_seq_set_data. Required by the v2.3.25 entropic_state_load C API.

Parameters

seq_id	Sequence id.
buffer	Source buffer (output of a prior save_state).

Returns: true when llama_state_seq_set_data accepts the buffer.

gh#23 MVP item 13 (v2.4.0 follow-up to v2.3.25).

Parameters

seq_id	llama sequence id.
buffer	Bytes captured by a prior `do_save_state` against the SAME model + same llama.cpp commit pin.

Returns: true when llama_state_seq_set_data reports a non-zero accepted size (per the llama.cpp contract: positive=ok, zero=failed to load).

Reimplemented from entropic::InferenceBackend.

Definition at line 4116 of file llama_cpp_backend.cpp.

◆ do_save_state()

bool entropic::LlamaCppBackend::do_save_state	(	int	seq_id,
		std::vector< uint8_t > &	buffer
	)		const

overrideprotectedvirtual

Capture a sequence's KV cache into a byte buffer.

Capture one sequence's KV cache via llama_state_seq_get_data.

Wraps llama.cpp's llama_state_seq_get_size + llama_state_seq_get_data. Required by the v2.3.25 entropic_state_save C API; the base class default returns false (no state support).

Parameters

seq_id	Sequence id (default 0 from the C API path).
buffer	Output buffer; resized to exact state size.

Returns: true on success; false when not active or copy short-reads.

gh#23 MVP item 13 (v2.3.25 + v2.4.0). v2.3.25 shipped the C API surface (entropic_state_save / entropic_state_load) but the backend layer was never overridden — the base-class stub returned false, so the C API always reported ENTROPIC_ERROR_INTERNAL against a real model. The v2.4.0 minor-release ceremony surfaced this; the fix wires llama.cpp's sequence-scoped state API here.

Parameters

seq_id	llama sequence id (the C API path passes 0).
buffer	Output; resized to the exact state size.

Returns: true when llama.cpp emitted the full sized blob.

Reimplemented from entropic::InferenceBackend.

Definition at line 4089 of file llama_cpp_backend.cpp.

◆ do_supports()

bool entropic::LlamaCppBackend::do_supports ( BackendCapability cap ) const

overrideprotectedvirtual

Declare llama.cpp backend capabilities.

Parameters

cap	Capability to check.

Returns: true if this backend supports the capability.

Reimplemented from entropic::InferenceBackend.

Definition at line 3978 of file llama_cpp_backend.cpp.

◆ do_unload()

void entropic::LlamaCppBackend::do_unload ( )

overrideprotectedvirtual

Full unload — free all resources, clear prompt cache.

Implements entropic::InferenceBackend.

Definition at line 744 of file llama_cpp_backend.cpp.

◆ effective_stop()

std::vector< std::string > entropic::LlamaCppBackend::effective_stop ( const GenerationParams & params ) const

params.stop + the sequential tool-call close marker, if applicable.

params.stop plus the sequential close marker (gh#105).

gh#105 (v2.8.3): the gh#103 sequential hard-stop must be injected with THIS generation's family marker, derived AFTER render captured the format. Each decode body calls this post-render and feeds the result to step_token — so the marker matches the current render (gh#103 injected it pre-render off the previous/empty format, so it never fired on the first call). Gated on params.tool_call_mode == "sequential"; otherwise returns params.stop unchanged (batch is byte-identical).

Parameters

params Generation params (read-only).

Returns: Effective stop-sequence list for the decode loop. @utility

Version: 2.8.3

See header.

Called post-render by each decode body so the marker reflects THIS generation's captured format (the live capture). append_sequential_stop is a no-op unless params.tool_call_mode == "sequential".

Parameters

params Generation params.

Returns: Effective stop list. @utility

Version: 2.8.3

Definition at line 1286 of file llama_cpp_backend.cpp.

◆ extract_system_prompt()

std::string entropic::LlamaCppBackend::extract_system_prompt ( const std::vector< Message > & messages )

staticprotected

Extract the system prompt from messages.

Extract system prompt text from message list.

Parameters

messages Conversation history.

Returns: System prompt text, empty if no system message.

Version: 1.8.3

Parameters

messages Conversation history.

Returns: System message content, empty if none found.

Definition at line 1934 of file llama_cpp_backend.cpp.

◆ extract_token_logprob()

float entropic::LlamaCppBackend::extract_token_logprob	(	const float *	logits,
		int32_t	next_token,
		int	n_vocab
	)

staticprotected

Extract log-probability for a token from logits.

Computes log_softmax(logits)[next_token] using the numerically stable form: logits[t] - max - log(sum(exp(logits - max))).

Parameters

logits	Raw logits array from llama_get_logits_ith().
next_token	The token to score.
n_vocab	Vocabulary size.

Returns: log P(next_token | context).

Version: 1.9.10

Uses numerically stable log-softmax: log P(t) = logits[t] - max - log(sum(exp(logits - max)))

Parameters

logits	Raw logits array.
next_token	Token to score.
n_vocab	Vocabulary size.

Returns: log P(next_token | context).

Definition at line 948 of file llama_cpp_backend.cpp.

◆ generate_after_prefill()

GenerationResult entropic::LlamaCppBackend::generate_after_prefill	(	Sampler &	sampler,
		const GenerationParams &	params,
		std::function< void(std::string_view)>	on_token,
		std::atomic< bool > *	cancel
	)

protected

The post-prefill sampling loop (extracted from decode_loop).

Post-prefill sampling loop (gh#98, v2.8.0 extraction from decode_loop).

gh#98 (v2.8.0): assumes the prompt is already prefilled into seq 0 (its last-position logits are ready) and runs the step_token loop. Reused by decode_loop (after run_prefill) and by the same-prefix batch path (after decode_tokens_from of a request's suffix off the shared prefix).

Parameters

sampler	Per-request sampler (carries its grammar).
params	Generation parameters (max_tokens, stop).
on_token	Streaming callback (empty for batch).
cancel	Cancel flag (nullptr for batch).

Returns: GenerationResult with content/finish_reason/token_count.

The prompt is already prefilled into seq 0; runs step_token until EOS / stop / max_tokens / cancel. Shared by decode_loop (after run_prefill) and the same-prefix batch path (after a suffix decode_tokens_from).

Parameters

sampler	Per-request sampler (carries its grammar).
params	Generation parameters (max_tokens, stop).
on_token	Streaming callback (empty for batch).
cancel	Cancel flag (nullptr for batch).

Returns: GenerationResult with content/finish_reason/token_count.

Definition at line 1604 of file llama_cpp_backend.cpp.

◆ generate_mtp()

GenerationResult entropic::LlamaCppBackend::generate_mtp	(	const std::vector< Message > &	messages,
		const GenerationParams &	params,
		std::function< void(std::string_view token)>	on_token,
		std::atomic< bool > &	cancel,
		const std::string &	head_path,
		int	n_max
	)

Speculative generation via a target-owned MTP head (gh#106).

Unlike gh#36's separate-draft kernel, the MTP head (a small trunk-sharing GGUF, e.g. mtp-gemma-4-E2B-it.gguf) shares the target's KV via ctx_other and reads the target's hidden states. The head context is created lazily against the live ctx_ (and torn down on deactivate). The decode loop is lossless: every draft is verified against the target's own logits.

Parameters

messages	Conversation history.
params	Generation parameters (samplers, max_tokens, seed).
on_token	Callback fired once per accepted token.
cancel	Cancellation flag (polled between accept rounds).
head_path	Path to the MTP head GGUF.
n_max	Draft window (proposed tokens per round; ≤0 → 16).

Returns: GenerationResult.

Version: 2.9.0

gh#108: holds mtp_mutex_ across head setup + the whole decode so a concurrent deactivate/unload teardown cannot free mtp_draft_ctx_ mid-flight (gh#58 UAF). Fails loudly (no plain-decode fallback) when the request is outside the MTP envelope — see mtp_guard / mtp_unsupported_reason.

Definition at line 3906 of file llama_cpp_backend.cpp.

◆ generate_multimodal()

GenerationResult entropic::LlamaCppBackend::generate_multimodal	(	const std::vector< Message > &	messages,
		const GenerationParams &	params,
		std::function< void(std::string_view token)>	on_token,
		std::atomic< bool > *	cancel
	)

protected

Multimodal generation core (v1.9.11 Phases 5–7).

Multimodal generation core (v2.1.8, gh#37 / v1.9.11 Phase 6).

Runs the libmtmd-backed prefill + decode for messages whose content_parts contain image entries. Builds an mtmd_bitmap list from ContentPart paths, inserts media markers in the chat-formatted prompt, then calls mtmd_helper_eval_chunks to encode images and decode all chunks in order.

After eval, sampling proceeds via the normal step_token loop — the cache state is positioned past the multimodal prefill.

Parameters

messages	Conversation history (must contain images).
params	Generation parameters.
on_token	Per-token callback (nullptr for batch mode).
cancel	Cancel atomic (nullptr for batch mode).

Returns: GenerationResult.

Version: 2.1.8

Definition at line 2484 of file llama_cpp_backend.cpp.

◆ generate_speculative_with_draft()

GenerationResult entropic::LlamaCppBackend::generate_speculative_with_draft	(	const std::vector< Message > &	messages,
		const GenerationParams &	params,
		std::function< void(std::string_view token)>	on_token,
		std::atomic< bool > &	cancel,
		LlamaCppBackend &	draft,
		int	n_draft_max,
		const std::string &	draft_path
	)

Speculative-decoding kernel with explicit draft backend.

Speculative generation against a draft model (gh#36).

Adapts the upstream speculative-simple reference loop at pin 253ba110b to entropic's idioms: drives a draft LlamaCppBackend through common_speculative_*, verifies in batch on the target, and emits one on_token callback per accepted token (not per proposed) — preserving the standard streaming contract. Honors cancel between accept rounds; latency is one accept round (typically 1–8 tokens).

Correctness contract: output distribution bit-identical to plain decode on rejection cases. Verified by test_speculative_correctness.cpp against Qwen3.6-A3B target

Qwen3.5-0.8B draft.

Constraints (v2.1.11):

Both target and draft must report common_context_can_seq_rm == COMMON_CONTEXT_SEQ_RM_TYPE_FULL. Falls back to NOT_SUPPORTED otherwise (partial-acceptance checkpoint restore is deferred — see decision log #41).
Both backends must be ACTIVE.
Caller (orchestrator) is responsible for compat verification before calling — this entry trusts the pair.

Parameters

messages	Conversation history.
params	Generation parameters (samplers + max_tokens + seed).
on_token	Callback fired once per accepted token.
cancel	Cancellation flag (polled between accept rounds).
draft	Draft backend (must be ACTIVE).

Returns: GenerationResult with content, token_count, finish_reason.

Version: 2.1.11

Definition at line 3571 of file llama_cpp_backend.cpp.

◆ has_common_chat_params()

bool entropic::LlamaCppBackend::has_common_chat_params ( ) const

inline

True iff the last render captured common_chat parse params (gh#87).

Set when render_with_tools (via the internal render seam) renders with active tools; cleared on a tool-less render. The orchestrator queries this to route post-generation parsing through parse_response vs the legacy adapter parse_tool_calls.

Returns: true if a tool render captured params for parse_response. @utility

Version: 2.7.0

Definition at line 376 of file llama_cpp_backend.h.

◆ init_mmproj_if_configured()

void entropic::LlamaCppBackend::init_mmproj_if_configured ( )

protected

Initialize the libmtmd context if mmproj is configured.

Initialize libmtmd context if mmproj is configured (v2.1.8).

@utility

Version: 2.1.8

Extracted from do_activate to keep that function under the knots SLOC threshold. mtmd holds a reference to the live model_, so init runs after the GPU reload and before any generation. Failure is non-fatal — the engine falls back to text-only with a logged diagnostic.

Definition at line 490 of file llama_cpp_backend.cpp.

◆ inject_sampler_factory_for_test()

void entropic::LlamaCppBackend::inject_sampler_factory_for_test ( std::unique_ptr< SamplerFactory > factory )

Inject a SamplerFactory for unit testing (v2.3.10).

Wire a mock SamplerFactory for unit tests (v2.3.10 seam).

Bypasses do_activate() to wire a mock factory into the backend so the decode loop's chain-construction + per-token sampling paths can be exercised without a real llama_context. Production code MUST NOT call this — use activate() for real lifecycle.

Symmetry: matches inject_tokenizer_for_test in shape so the two v2.3.10 seams (Tokenizer + Sampler) feel consistent at the test surface. No #ifdef ENTROPIC_TESTING — the build-flag conditional creates a test-vs-production code-shape divergence we don't want for coverage measurement.

Parameters

factory Mock factory (ownership transferred). @utility

Version: 2.3.10

Sister hook to inject_tokenizer_for_test. Does NOT touch state_ — the decode loop's is_loaded()-gated entry points are already covered by the tokenizer seam, and Sampler tests exercise the factory at a finer grain than full backend lifecycle. Tests that need a "WARM" backend can chain both inject_*_for_test calls (tokenizer flips state, sampler factory then plugs in without disturbing it).

Definition at line 733 of file llama_cpp_backend.cpp.

◆ inject_tokenizer_for_test()

void entropic::LlamaCppBackend::inject_tokenizer_for_test ( std::unique_ptr< Tokenizer > tokenizer )

Inject a tokenizer for unit testing (v2.3.10).

Wire a mock Tokenizer for unit tests + mark the backend as WARM so is_loaded()-gated public methods route to it.

Bypasses do_load() to wire a mock Tokenizer into the backend AND mark the backend as WARM so is_loaded()-gated public methods (tokenize_text, do_count_tokens via base count_tokens) route through the mock. Production code MUST NOT call this — use load() / activate() for real model lifecycle.

Distinct entrypoint name + no production caller ⇒ no risk of accidental misuse; #ifdef ENTROPIC_TESTING was considered but rejected (the build-flag conditional creates a test-vs-production code-shape divergence we don't want for coverage measurement).

Parameters

tokenizer Mock tokenizer (ownership transferred). @utility

Version: 2.3.10

Implementation note: state_ is set directly to WARM here rather than via the normal load() path because there's no underlying model. The destructor's do_unload() handles cleanup — the tokenizer_.reset() happens BEFORE the (nullptr) model_/vocab_ are touched, so no dangling-borrow risk.

Definition at line 712 of file llama_cpp_backend.cpp.

◆ invalidate_resident_kv()

void entropic::LlamaCppBackend::invalidate_resident_kv ( )

protected

gh#96 (v2.7.5): drop the warm-keep resident-KV record.

Called by every non-text-cached generate path (multimodal, complete, speculative) that mutates seq 0 out-of-band, so the next text turn cannot reuse a stale record. @utility

Out-of-line so the header carries no inline body here (an inline body in this region makes the knots complexity counter attribute the whole declaration run to the preceding declaration). @utility

Definition at line 2238 of file llama_cpp_backend.cpp.

◆ is_recurrent()

bool entropic::LlamaCppBackend::is_recurrent ( ) const

protected

Check if loaded model is recurrent.

Returns: true if GDN/Mamba/RWKV architecture.

Version: 1.9.13

Returns: true if GDN/Mamba/RWKV architecture.

Definition at line 3965 of file llama_cpp_backend.cpp.

◆ kv_pos_max()

int entropic::LlamaCppBackend::kv_pos_max ( ) const

inline

Highest occupied KV position in seq 0 right now (live query).

gh#97 (v2.7.6): a correct prefill leaves exactly input_tokens positions resident, so after a generation this is ~input + generated - 1. If a prefix-reuse path (warm-keep / cache restore) leaves an un-removed tail — e.g. partial seq_rm rejected by recurrent/hybrid memory — this INFLATES beyond the prompt size and eventually exhausts the context with the cache mostly empty. Exposed so the gh#97 hybrid test can assert no inflation, catching the desync immediately instead of at exhaustion.

Returns: seq-0 pos_max, or -1 if not active. @utility

Version: 2.7.6

Definition at line 287 of file llama_cpp_backend.h.

◆ last_gen_decode_calls()

int entropic::LlamaCppBackend::last_gen_decode_calls ( ) const

inline

Number of batched generation decodes in the last gh#98 batch.

Observable that the multi-seq batched-decode engaged: ≈ the longest per-request output (one decode per step over all sequences), NOT the serial Σ output_len. @utility

Version: 2.8.0

Definition at line 247 of file llama_cpp_backend.h.

◆ last_input_tokens()

int entropic::LlamaCppBackend::last_input_tokens ( ) const

inline

Tokenized prompt size of the last generation (input tokens).

Returns: Input token count of the most recent generate() call. @utility

Version: 2.7.6

Definition at line 271 of file llama_cpp_backend.h.

◆ last_prefill_ms()

double entropic::LlamaCppBackend::last_prefill_ms ( ) const

inline

Wall-clock milliseconds spent in prefill by the last generation.

gh#96 (v2.7.5): steady_clock around the prefill dispatch in run_prefill_cached — the real per-turn cost the optimization targets (token count is the deterministic proxy; this is the wall-clock the consumer actually feels). Climbs with history today; should flatten once warm-keep reuse lands. The realized magnitude scales with model size + context length (small on a tiny model, large on the consumer's MoE at long context). Measured directly, not via llama_perf.

Returns: Prefill wall-clock ms for the most recent generate() call. @utility

Version: 2.7.5

Definition at line 263 of file llama_cpp_backend.h.

◆ last_prefill_tokens()

int entropic::LlamaCppBackend::last_prefill_tokens ( ) const

inline

Prompt (prefill) tokens actually decoded by the last generation.

gh#96 (v2.7.5): counted directly as tokens pushed through llama_decode during prefill (run_prefill + decode_tokens_from accumulate into it; reset per generation in run_prefill_cached). A prompt-cache HIT restores the system prefix without a decode, so this is the re-decoded post-system remainder — in a multi-turn loop it climbs every turn today, and should collapse to the per-turn delta once warm-keep reuse lands. Exposed for the gh#96 behavioral tests to assert reuse without log-scraping. (Counted directly rather than via llama_perf n_p_eval, which proved unreliable across the state-restore boundary.)

Returns: Prompt tokens decoded in the most recent generate() call. @utility

Version: 2.7.5

Definition at line 236 of file llama_cpp_backend.h.

◆ llama_context_ptr()

llama_context * entropic::LlamaCppBackend::llama_context_ptr ( )

inline

Get the active llama_context pointer.

Returns: nullptr if state is not ACTIVE. @utility

Version: 1.9.2

Definition at line 218 of file llama_cpp_backend.h.

◆ llama_model_ptr()

llama_model * entropic::LlamaCppBackend::llama_model_ptr ( )

inline

Get the loaded llama_model pointer.

Returns: nullptr if state is COLD. @utility

Version: 1.9.2

Definition at line 210 of file llama_cpp_backend.h.

◆ load_gpu_model()

bool entropic::LlamaCppBackend::load_gpu_model ( )

protected

Load the GGUF model onto the GPU (do_activate step 1).

Extracted from do_activate to keep it knots-clean. Sets model_ + vocab_ on success; sets last_error_ and returns false otherwise.

Returns: true on success.; true on success; sets last_error_ on failure.

gh#87 (v2.7.0): frees the WARM CPU model BEFORE reloading with GPU layers. The prior order (load new_model, then free the old model_) held two full llama_model objects resident during a single activate — a transient double-load. Changing n_gpu_layers requires a reload (you cannot re-offload an already-loaded model), but there is no reason to keep the discarded CPU copy alive across it: a failed GPU reload leaves the backend unusable regardless, and the COLD path can recover it. Freeing first removes the simultaneity (and the duplicate model metadata/buffers) without changing the load contract.

Definition at line 408 of file llama_cpp_backend.cpp.

◆ mtmd_prefill()

entropic_error_t entropic::LlamaCppBackend::mtmd_prefill	(	const std::string &	prompt,
		const std::vector<::mtmd_bitmap * > &	bitmaps,
		std::string &	err_msg
	)

protected

Run mtmd_tokenize + mtmd_helper_eval_chunks on a prompt.

mtmd prefill helper (v2.1.8) — tokenize + eval chunks.

Parameters

	prompt	Marker-substituted chat-formatted prompt.
	bitmaps	Loaded image bitmaps in marker order (borrowed).
[out]	err_msg	Filled on failure.

Returns: ENTROPIC_OK on success, error code on failure. @utility

Version: 2.1.8

Wraps mtmd_tokenize + mtmd_helper_eval_chunks. KV cache is cleared first so prefill always starts at seq position 0. Bitmap ownership stays with the caller (mtmd_tokenize borrows for the call).

Definition at line 2397 of file llama_cpp_backend.cpp.

◆ mtp_active()

bool entropic::LlamaCppBackend::mtp_active ( ) const

inline

True when an MTP head context is live against the current ctx_.

Returns: true if mtp_draft_ctx_ is set (head ready), false otherwise. @utility

Version: 2.9.0

Definition at line 575 of file llama_cpp_backend.h.

◆ mtp_guard()

GenerationResult entropic::LlamaCppBackend::mtp_guard	(	const GenerationParams &	params,
		const std::function< void(std::string_view)> &	on_token,
		const std::string &	head_path,
		int	n_max
	)

protected

Validate MTP run preconditions (gh#108, fail-fast/fail-loud).

Validate the MTP run preconditions (gh#108).

ACTIVE → envelope (temp/grammar/streaming via mtp_unsupported_reason) → head setup → draft-window bound. Returns an ENTROPIC_OK result to proceed, or a typed loud error (caller propagates it — no fallback).

Returns an OK result to proceed, or a typed loud error (no fallback) when MTP cannot run.

Order: ACTIVE → envelope (grammar/streaming) → head setup → draft-window bound. Runs the envelope check BEFORE loading the head so a misconfigured call fails fast without a wasted GGUF load. gh#108 (v2.9.2): tools are NO LONGER guarded (MTP+tools is lossless-correct; stops now honored). gh#108 (v2.9.3): flash_attn is NO LONGER guarded — the extern/llama.cpp pin is past upstream #25148, which fixes the GQA-2 flash-attn abort the guard existed for. gh#108 (v2.9.4): temperature is NO LONGER guarded — the MTP draft proposal is a deterministic point mass (see mtp_envelope.h), so the existing exact-match accept step is already lossless at any temperature.

Definition at line 3872 of file llama_cpp_backend.cpp.

◆ parse_response()

LlamaCppBackend::CommonChatResult entropic::LlamaCppBackend::parse_response ( const std::string & raw ) const

Parse a raw model emission via the last captured render params.

Parse a raw emission via the last captured render params (gh#87).

Reconstructs the common_chat_parser_params from the state captured by the most recent render_with_tools and runs common_chat_parse. The converting ctor copies only format + generation_prompt, so this explicitly load()s the serialized PEG arena — without it the parser silently degrades to pure content and extracts zero tool calls (gh#87 Increment-1 finding). If no render has captured params, returns the raw text as content with no calls.

Parameters

raw	Raw model output (assistant turn only, no generation prompt).

Returns: Parsed tool calls + cleaned content + reasoning.

MUST load() the serialized PEG arena — the parser_params ctor copies only format + generation_prompt, so without the load the parser silently degrades to pure content (Increment-1 finding). See header for contract.

gh#90 (v2.7.2): coerces numeric scalars back to strings for string-typed params (the gemma <|"|> escape loses type through PEG_GEMMA4).

gh#106 (v2.9.0): strips Gemma 4 QAT <|channel> reasoning into reasoning_content.

Parameters

raw	Raw model output (assistant turn only).

Returns: Parsed tool calls + cleaned content + reasoning.

Definition at line 1373 of file llama_cpp_backend.cpp.

◆ prefill_and_cache_prefix()

bool entropic::LlamaCppBackend::prefill_and_cache_prefix	(	const std::vector< llama_token > &	tokens,
		int	prefix_tokens,
		const CacheKey &	key
	)

protected

Two-pass prefill: prefix-only prefill → save → rest.

Prefill in two passes: prefix → save → remainder.

Parameters

tokens	Full token sequence.
prefix_tokens	System prefix token count.
key	Cache key to store under.

Returns: true on success.

Version: 2.0.6

The v2.0.6 correctness fix. llama_state_seq_get_data has no range parameter, so any save captures whatever KV state happens to be in seq 0 at save time. By prefilling ONLY the system prefix first, saving, and then continuing with the rest of the prompt, we guarantee the cache entry covers exactly prefix_tokens positions — no residue from later conversation tokens can leak into a subsequent delegation's cache hit.

If prefix_tokens is 0 or >= total tokens, falls back to a plain full prefill without caching (nothing meaningful to cache).

Parameters

tokens	Full token sequence.
prefix_tokens	System prefix token count.
key	Cache key for the prefix.

Returns: true on success.

Definition at line 2091 of file llama_cpp_backend.cpp.

◆ prefill_batch_suffixes()

bool entropic::LlamaCppBackend::prefill_batch_suffixes	(	std::vector< BatchSeq > &	seqs,
		const std::vector< std::vector< llama_token > > &	toks,
		std::size_t	shared
	)

protected

Prefill each request's suffix; set per-seq logits_idx.

Prefill every request's suffix in one multi-seq batch (gh#98).

Version: 2.8.0

Each sequence's suffix tokens are decoded at their real positions; only the last token of each carries logits, recorded as that seq's first sample idx.

Definition at line 1721 of file llama_cpp_backend.cpp.

◆ prefill_dispatch()

bool entropic::LlamaCppBackend::prefill_dispatch	(	const std::vector< llama_token > &	tokens,
		const std::string &	system_prompt,
		const std::vector< Message > &	messages,
		const GenerationParams &	params
	)

protected

Cache-aware prefill dispatch (gh#96 v2.7.5: extracted body of run_prefill_cached so the wrapper owns the perf reset+capture).

Cache-aware prefill dispatch (pre-v2.7.5 run_prefill_cached body).

Parameters

tokens	Full token sequence.
system_prompt	System prompt text for cache key.
messages	Original messages (for prefix boundary).
params	Generation parameters.

Returns: true on success.

Version: 2.7.5

On cache hit: restore prefix KV and decode the remainder. On cache miss: two-pass prefill (prefix → save → remainder) so the stored cache entry contains prefix-only state. Extracted from run_prefill_cached in v2.7.5 so the wrapper can own the gh#96 perf reset + capture.

Parameters

tokens	Full token sequence.
system_prompt	System prompt text for cache key.
messages	Original messages (for prefix boundary).
params	Generation parameters.

Returns: true on success.

Definition at line 2258 of file llama_cpp_backend.cpp.

◆ prefill_shared_and_fanout()

bool entropic::LlamaCppBackend::prefill_shared_and_fanout	(	std::vector< BatchSeq > &	seqs,
		const std::vector< llama_token > &	seq0,
		std::size_t	shared
	)

protected

Prefill shared prefix into seq 0 + seq_cp fan-out.

Prefill the shared prefix into seq 0 and seq_cp it to the others.

Version: 2.8.0

Definition at line 1698 of file llama_cpp_backend.cpp.

◆ prepare_batch_seqs()

bool entropic::LlamaCppBackend::prepare_batch_seqs	(	std::vector< BatchSeq > &	seqs,
		const std::vector< GenerationParams > &	params
	)

protected

Build per-request sampler chains + seq ids.

Build per-request sampler chains + KV sequence ids (gh#98).

Version: 2.8.0

Returns: false if any sampler chain could not be built.

Definition at line 1679 of file llama_cpp_backend.cpp.

◆ release_temp_seq_id()

void entropic::LlamaCppBackend::release_temp_seq_id ( llama_seq_id seq_id )

protected

Release a temporary sequence ID back to the pool.

Parameters

seq_id The seq_id to release.

Version: 1.9.10

Parameters

seq_id The seq_id to release.

Definition at line 930 of file llama_cpp_backend.cpp.

◆ release_temp_seq_id_for_test()

void entropic::LlamaCppBackend::release_temp_seq_id_for_test ( llama_seq_id id )

inline

Release a temp seq_id (test-only seam, gh#98).

Parameters

id	The seq_id to release. @utility

Version: 2.8.0

Definition at line 161 of file llama_cpp_backend.h.

◆ release_temp_seqs()

void entropic::LlamaCppBackend::release_temp_seqs ( std::vector< BatchSeq > & seqs )

protected

Release every batch sequence's temp seq_id (seq 0 excluded).

Release every batch sequence's temp seq_id (seq 0 excluded, gh#98).

Version: 2.8.0

Definition at line 1832 of file llama_cpp_backend.cpp.

◆ reload_model_cpu_only()

void entropic::LlamaCppBackend::reload_model_cpu_only ( )

protected

Reload the model CPU-only for the WARM state (do_deactivate tail).

Extracted from do_deactivate to keep it under the knots ABC gate.

Extracted from do_deactivate to keep it under the knots ABC gate. On success rebinds model_/vocab_/tokenizer_; on failure leaves model_ null (recoverable — the next activate reloads from scratch).

Definition at line 663 of file llama_cpp_backend.cpp.

◆ render_prompt()

std::string entropic::LlamaCppBackend::render_prompt	(	const std::vector< Message > &	messages,
		const GenerationParams &	params
	)

protected

Generation render seam: common_chat-with-tools or legacy (gh#87).

Generation render seam (gh#87): tools → common_chat, else legacy.

Routes to render_with_tools (capturing parse params) when tools have been staged via set_active_tools, else to apply_chat_template (clearing any stale captured params). Used by every message-generate entry point so the common_chat path is reachable from generate(), not just raw complete(). Raw completion + the router path bypass this.

Parameters

messages	Conversation history.
params	Generation parameters.

Returns: Formatted prompt string.

Clears any stale captured params on the tool-less branch so has_common_chat_params() reflects only THIS render. See header.

Parameters

messages	Conversation history.
params	Generation parameters.

Returns: Formatted prompt string.

Definition at line 1175 of file llama_cpp_backend.cpp.

◆ render_with_tools()

std::string entropic::LlamaCppBackend::render_with_tools	(	const std::vector< Message > &	messages,
		const GenerationParams &	params
	)

Render messages through common_chat WITH the active tools.

Render messages through common_chat WITH active tools (gh#87).

Like apply_chat_template but routes the staged tool defs into inputs.tools and CAPTURES the rendered common_chat_params (format, generation_prompt, serialized PEG parser) so a subsequent parse_response can decode the model's emission in the same context the render established. Falls back to the low-level template (no capture) if the jinja path is unavailable.

Parameters

messages	Conversation history.
params	Generation parameters (enable_thinking honored).

Returns: Formatted prompt string.

Captures the rendered params (format/generation_prompt/parser) so parse_response can decode the emission. See header for contract.

Parameters

messages	Conversation history.
params	Generation parameters.

Returns: Formatted prompt string.

Definition at line 1210 of file llama_cpp_backend.cpp.

◆ restore_cached_prefix()

bool entropic::LlamaCppBackend::restore_cached_prefix	(	const CacheEntry *	cached,
		const std::vector< llama_token > &	tokens
	)

protected

Restore KV state from cache and decode remaining tokens.

Restore cached prefix KV and decode remaining tokens.

Parameters

cached	Cache entry to restore.
tokens	Full token sequence.

Returns: true on success, false to fall back to full prefill.

Version: 2.0.6

After v2.0.6 the cached state contains ONLY the system prefix (two-pass prefill saves at the prefix boundary, see prefill_and_cache_prefix). Restore is therefore clean by construction: llama_state_seq_set_data leaves seq 0 with exactly cached->token_count positions filled, and llama_batch_get_one auto-positions subsequent decodes at that boundary.

Parameters

cached	Cache entry to restore from.
tokens	Full token sequence.

Returns: true on success, false to fall back to full prefill.

Definition at line 1997 of file llama_cpp_backend.cpp.

◆ run_batch_gen_loop()

void entropic::LlamaCppBackend::run_batch_gen_loop	(	std::vector< BatchSeq > &	seqs,
		int	max_steps,
		std::atomic< bool > &	cancel
	)

protected

Decode all sequences together until each finishes.

Decode all sequences together until each finishes (gh#98).

Version: 2.8.0

One llama_decode per step over the still-active sequences' just-sampled tokens — the multi-seq throughput win. last_gen_decode_calls_ counts the decodes (≈ longest output), the observable that batching engaged (vs N·len for a serial fallback).

Definition at line 1785 of file llama_cpp_backend.cpp.

◆ run_batched_decode()

std::vector< GenerationResult > entropic::LlamaCppBackend::run_batched_decode	(	const std::vector< std::vector< llama_token > > &	toks,
		const std::vector< GenerationParams > &	params,
		std::size_t	shared,
		std::atomic< bool > &	cancel
	)

protected

Run the gh#98 multi-seq batched decode (v2.8.0).

Prefills the shared prefix once into seq 0, seq_cps it to N sequences, prefills each request's suffix, then decodes all sequences together — one llama_decode per step over N tokens, each sampled under its own grammar chain. Caller guarantees batch_is_viable (plain KV, n<=n_parallel, shared>0, suffixes fit n_batch). Returns one result per request.

Version: 2.8.0

Prefill shared once → seq_cp fan-out → batched suffix prefill → batched generation loop (one decode/step over N sequences, each sampled under its own grammar). last_prefill_tokens_ holds shared + Σ suffix (prefix prefilled once); last_gen_decode_calls_ holds the batched step count.

Definition at line 1849 of file llama_cpp_backend.cpp.

◆ run_prefill()

bool entropic::LlamaCppBackend::run_prefill ( const std::vector< llama_token > & tokens )

protected

Run batched prefill on input tokens.

Parameters

tokens Input token sequence.

Returns: true on success.

Version: 1.8.2

Parameters

tokens Input token sequence.

Returns: true on success.

Definition at line 1484 of file llama_cpp_backend.cpp.

◆ run_prefill_cached()

bool entropic::LlamaCppBackend::run_prefill_cached	(	const std::vector< llama_token > &	tokens,
		const std::string &	system_prompt,
		const std::vector< Message > &	messages,
		const GenerationParams &	params
	)

protected

Run prefill with prompt cache integration.

Run prefill with prompt cache integration (perf-instrumented wrapper).

Parameters

tokens	Full token sequence.
system_prompt	System prompt text for cache key.
messages	Original messages (for prefix boundary).
params	Generation parameters.

Returns: true on success.

Version: 1.8.3

Resets the llama perf counters, dispatches to the cache-aware prefill (prefill_dispatch), then captures this generation's prompt-eval token count (gh#96). Thin wrapper so the dispatch body stays under the knots SLOC gate.

Parameters

tokens	Full token sequence.
system_prompt	System prompt text for cache key.
messages	Original messages (for prefix boundary).
params	Generation parameters.

Returns: true on success.

Definition at line 2134 of file llama_cpp_backend.cpp.

◆ run_sampling_loop()

GenerationResult entropic::LlamaCppBackend::run_sampling_loop	(	const GenerationParams &	params,
		std::function< void(std::string_view token)>	on_token,
		std::atomic< bool > *	cancel,
		const std::chrono::steady_clock::time_point &	t0
	)

protected

Sample tokens until stop / max_tokens / cancel.

Shared sampling loop (v2.1.8).

Shared by generate_multimodal and the text-only paths after prefill. Operates on the already-positioned ctx_ KV cache.

Parameters

params	Generation parameters.
on_token	Per-token callback (nullable).
cancel	Atomic cancel flag (nullable).
t0	Generation start time for finalize_result.

Returns: GenerationResult. @utility

Version: 2.1.8

Operates on the already-positioned ctx_ KV cache. Mirrors the body of both text-only generation variants but factored out so generate_multimodal can reuse it after mtmd_prefill.

Definition at line 2440 of file llama_cpp_backend.cpp.

◆ sample_batch_active()

void entropic::LlamaCppBackend::sample_batch_active ( std::vector< BatchSeq > & seqs )

protected

Sample+accept+classify each still-active sequence.

Sample + accept + classify each still-active sequence (gh#98).

Version: 2.8.0

Definition at line 1757 of file llama_cpp_backend.cpp.

◆ sampler_factory_for_test()

SamplerFactory * entropic::LlamaCppBackend::sampler_factory_for_test ( ) const

inline

Read the currently-wired SamplerFactory (test-only).

Returns nullptr when no factory has been wired yet (the COLD state of sampler_factory_). Exists so seam tests can assert factory presence without poking at private members.

Returns: Borrowed factory pointer (do not free); nullptr if unset. @utility

Version: 2.3.10

Definition at line 136 of file llama_cpp_backend.h.

◆ save_prefix_to_cache()

void entropic::LlamaCppBackend::save_prefix_to_cache	(	const CacheKey &	key,
		int	prefix_tokens
	)

protected

Capture seq 0 KV state and store under the given key.

Parameters

key	Cache key.
prefix_tokens	Token count to record with the entry.

Version: 2.0.6

The caller is responsible for ensuring seq 0 currently contains exactly prefix_tokens positions — this function trusts that contract and serializes whatever is there.

Parameters

key	Cache key for the prefix.
prefix_tokens	Number of prefix tokens currently in seq 0.

Definition at line 2026 of file llama_cpp_backend.cpp.

◆ set_active_tools()

void entropic::LlamaCppBackend::set_active_tools ( const std::string & tools_json )

Stage tool definitions for the next common_chat render (gh#87).

Stage tool defs for the next common_chat render (gh#87).

Accepts entropic's MCP tool-list JSON (array of {name, description, inputSchema}), as produced by ServerManager::list_tools() and filtered per-tier. The defs flow into common_chat's inputs.tools at render time so the model is instructed in — and emits — its native tool-call wire format. Pass "" or "[]" to clear.

Parameters

tools_json JSON array of MCP tool definitions (borrowed). @utility

Version: 2.7.0

Parameters

tools_json MCP tool-list JSON array. @utility

Version: 2.7.0

Definition at line 1192 of file llama_cpp_backend.cpp.

◆ set_prompt_cache_config()

void entropic::LlamaCppBackend::set_prompt_cache_config ( const PromptCacheConfig & config )

inline

Set prompt cache configuration.

Must be called before activate(). The config is consumed when the cache is constructed during do_activate().

Parameters

config Prompt cache configuration. @utility

Version: 1.8.3

Definition at line 175 of file llama_cpp_backend.h.

◆ setup_mtp_draft()

bool entropic::LlamaCppBackend::setup_mtp_draft	(	const std::string &	head_path,
		int	n_max
	)

protected

Lazily build the MTP head context against the live ctx_ (gh#106).

Loads the head GGUF (full GPU) and creates a context with ctx_type=MTP, ctx_other=ctx_, n_rs_seq=0 — the shared-KV gemma4-assistant topology. Idempotent: a no-op when the live head already matches head_path; rebuilds when ctx_ changed.

Parameters

head_path	Path to the MTP head GGUF.
n_max	Draft window (stored for the decode loop).

Returns: true when mtp_draft_ctx_ is live; false (with last_error_) on failure.

Mirrors the server's separate-head branch (server-context.cpp:944-956): load the head GGUF, then a context with ctx_type=MTP, ctx_other=ctx_, n_rs_seq=0 so its KV is the target's (shared-memory gemma4-assistant topology). The remaining cparams inherit the target's geometry so the shared KV dimensions match. Idempotent when the live head already matches head_path; otherwise tears the stale head down and rebuilds.

Definition at line 554 of file llama_cpp_backend.cpp.

◆ step_token()

std::string entropic::LlamaCppBackend::step_token	(	Sampler &	sampler,
		std::string &	generated,
		std::function< void(std::string_view)> &	on_token,
		const std::vector< std::string > &	stop
	)

protected

Generate one token and append to output.

Parameters

sampler	Sampler used for the per-token draw (v2.3.10 seam).
generated	Accumulated output (mutated).
on_token	Streaming callback.
stop	Stop sequences.

Returns: "continue", "stop", "eos", or "error".

Version: 2.3.10

v2.3.10: routes per-token draw through the Sampler seam (sampler.sample()) instead of calling llama_sampler_sample directly. The production LlamaCppSampler captured ctx_ at construction; mocks bypass llama.cpp entirely.

Parameters

sampler	Sampler used for the per-token draw.
generated	Accumulated output string (mutated).
on_token	Streaming callback (may be nullptr).
stop	Stop sequences.

Returns: "continue", "stop", "eos", or "error".

Definition at line 1521 of file llama_cpp_backend.cpp.

◆ teardown_mtp_draft()

void entropic::LlamaCppBackend::teardown_mtp_draft ( )

protected

Free the MTP head context + model (gh#106 lifecycle).

Must run BEFORE ctx_ is freed (mtp_draft_ctx_ borrows it via ctx_other). Called from do_deactivate / do_unload and idempotent.

Ordered context-then-model, both null-guarded so it is idempotent and safe to call from do_deactivate / do_unload / the destructor. MUST run before ctx_ is freed: mtp_draft_ctx_ borrows ctx_ via ctx_other.

Note: Does NOT lock mtp_mutex_ — it is also reached from within the already-locked generate_mtp→setup_mtp_draft rebuild path, so locking here would self-deadlock. Callers that race generate_mtp (do_deactivate/do_unload) hold mtp_mutex_ around the call instead.

Definition at line 529 of file llama_cpp_backend.cpp.

◆ tokenize()

std::vector< llama_token > entropic::LlamaCppBackend::tokenize	(	const std::string &	text,
		bool	add_special
	)		const

protected

Tokenize text using model vocabulary.

Parameters

text	Input text.
add_special	Add special tokens (BOS).

Returns: Token vector.

Version: 1.8.2

Parameters

text	Input text.
add_special	Add BOS/EOS special tokens.

Returns: Vector of token IDs.

Definition at line 788 of file llama_cpp_backend.cpp.

◆ tokenize_text()

std::vector< int32_t > entropic::LlamaCppBackend::tokenize_text ( const std::string & text ) const

overridevirtual

Tokenize text to token IDs using model vocabulary.

Parameters

text	Input text.

Returns: Token ID vector with BOS.

Version: 1.10.2

Parameters

text	Input text.

Returns: Token ID vector with BOS. @utility

Version: 1.10.2

Reimplemented from entropic::InferenceBackend.

Definition at line 837 of file llama_cpp_backend.cpp.

◆ tool_call_close_marker()

std::string entropic::LlamaCppBackend::tool_call_close_marker ( ) const

overridevirtual

Tool-call close marker for the captured chat format (gh#103).

Maps the last render's last_chat_format_ to the string that terminates a single tool call in that family's native emission, so the orchestrator can append it to GenerationParams.stop for "sequential" tiers (hard-stop at the first closed tool call). Returns "" when no tool render captured params or the format has no reliable marker → caller injects no stop (batch behavior). Markers track the vendored PEG parser defaults (chat-peg-parser.cpp) and are validated empirically by the gh#103 model test; CONTENT_ONLY/unknown → "".

Returns: Close marker for the active format, or "". @utility

Version: 2.8.2

See the header + tool_call_markers.h. "" when no tool render captured params (no format) or the format has no confirmed per-call marker.

Returns: Close marker, or "". @utility

Version: 2.8.2

Reimplemented from entropic::InferenceBackend.

Definition at line 1267 of file llama_cpp_backend.cpp.

◆ try_warm_reuse()

bool entropic::LlamaCppBackend::try_warm_reuse ( const std::vector< llama_token > & tokens )

protected

gh#96 (v2.7.5): try incremental prefill against resident KV.

gh#96 (v2.7.5): incremental prefill against resident KV.

If warm-keep is enabled and the resident KV holds a usable token-prefix of tokens (warm_keep_cut > 0), seq_rm the divergent tail and decode only the delta, advancing in place. Returns false (no KV mutation) to fall back to a cold prefill on: warm-keep off, no/short resident match, or out-of-band KV mutation (occupancy mismatch).

Parameters

tokens Full incoming token sequence.

Returns: true if reuse handled the prefill; false to fall back.

Version: 2.7.5

See header. The seq_rm here operates on LIVE (never state-restored) KV cells, so the partial-removal restriction noted in architecture decision #35 (non-contiguous restored cells) does not apply. decode_tokens_from auto-positions the delta at cut (same mechanism the cache-restore path relies on). Occupancy is derived from llama_memory_seq_pos_max, never a software counter — so an out-of-band wipe (multimodal / complete / speculative / a different conversation interleaved on a shared backend) either fails the warm_keep_cut occupancy gate or diverges in the prefix scan, and we fall back. Per-turn cost shrinks from the whole post-system history to just the appended delta.

Parameters

tokens Full incoming token sequence.

Returns: true if reuse handled the prefill; false to fall back (no KV change).

Definition at line 2200 of file llama_cpp_backend.cpp.

Member Data Documentation

◆ active_tools_json_

std::string entropic::LlamaCppBackend::active_tools_json_

protected

MCP tool defs for next render.

Definition at line 678 of file llama_cpp_backend.h.

◆ ctx_

llama_context* entropic::LlamaCppBackend::ctx_ = nullptr

protected

Inference context (ACTIVE)

Definition at line 634 of file llama_cpp_backend.h.

◆ free_seq_ids_

std::vector<llama_seq_id> entropic::LlamaCppBackend::free_seq_ids_

protected

Available temporary seq_ids (v1.9.10)

Definition at line 1054 of file llama_cpp_backend.h.

◆ has_vision_

bool entropic::LlamaCppBackend::has_vision_ = false

protected

Cached mtmd_support_vision(mtmd_ctx_) result.

Version: 2.1.8

Definition at line 1091 of file llama_cpp_backend.h.

◆ have_chat_params_

bool entropic::LlamaCppBackend::have_chat_params_ = false

protected

True once a tool render captured params.

Definition at line 685 of file llama_cpp_backend.h.

◆ is_hybrid_

bool entropic::LlamaCppBackend::is_hybrid_ = false

protected

gh#97: attention + recurrent/SSM memory

Definition at line 1069 of file llama_cpp_backend.h.

◆ is_recurrent_

bool entropic::LlamaCppBackend::is_recurrent_ = false

protected

True if loaded model is recurrent (GDN/Mamba/RWKV).

Set during do_load() from llama_model_is_recurrent(). Drives capability reporting (KV_CACHE vs HIDDEN_STATE, speculative decoding compatibility, etc.).

Version: 1.9.13

Definition at line 1068 of file llama_cpp_backend.h.

◆ last_chat_format_

int entropic::LlamaCppBackend::last_chat_format_ = 0

protected

Captured common_chat_format.

Definition at line 682 of file llama_cpp_backend.h.

◆ last_gen_decode_calls_

int entropic::LlamaCppBackend::last_gen_decode_calls_ = 0

protected

gh#98: batched-decode step count of last batch

Definition at line 637 of file llama_cpp_backend.h.

◆ last_generation_prompt_

std::string entropic::LlamaCppBackend::last_generation_prompt_

protected

Captured generation_prompt.

Definition at line 683 of file llama_cpp_backend.h.

◆ last_input_tokens_

int entropic::LlamaCppBackend::last_input_tokens_ = 0

protected

gh#97: tokenized prompt size of last generate()

Definition at line 638 of file llama_cpp_backend.h.

◆ last_parser_

std::string entropic::LlamaCppBackend::last_parser_

protected

Captured serialized PEG arena.

Definition at line 684 of file llama_cpp_backend.h.

◆ last_prefill_ms_

double entropic::LlamaCppBackend::last_prefill_ms_ = 0.0

protected

gh#96: prefill wall-clock ms of last generate()

Definition at line 639 of file llama_cpp_backend.h.

◆ last_prefill_tokens_

int entropic::LlamaCppBackend::last_prefill_tokens_ = 0

protected

gh#96: prompt tokens decoded by last generate()

Definition at line 636 of file llama_cpp_backend.h.

◆ model_

llama_model* entropic::LlamaCppBackend::model_ = nullptr

protected

Loaded model (WARM+)

Definition at line 633 of file llama_cpp_backend.h.

◆ mtmd_ctx_

::mtmd_context* entropic::LlamaCppBackend::mtmd_ctx_ = nullptr

protected

libmtmd context, or nullptr if no mmproj loaded.

Allocated in do_activate() when ModelConfig::mmproj_path is set; freed in do_deactivate()/do_unload(). The leading :: is required — without it mtmd_context resolves into the entropic:: namespace and the forward declaration becomes an incompatible incomplete type at the call sites.

Version: 2.1.8

Definition at line 1087 of file llama_cpp_backend.h.

◆ mtp_draft_ctx_

llama_context* entropic::LlamaCppBackend::mtp_draft_ctx_ = nullptr

protected

MTP context (ctx_type=MTP, ctx_other=ctx_)

Definition at line 644 of file llama_cpp_backend.h.

◆ mtp_draft_model_

llama_model* entropic::LlamaCppBackend::mtp_draft_model_ = nullptr

protected

MTP head GGUF (separate, trunk-sharing)

Definition at line 643 of file llama_cpp_backend.h.

◆ mtp_head_path_

std::string entropic::LlamaCppBackend::mtp_head_path_

protected

Path the live mtp_draft_ctx_ was built from.

Definition at line 645 of file llama_cpp_backend.h.

◆ mtp_mutex_

std::mutex entropic::LlamaCppBackend::mtp_mutex_

protected

gh#108: serialises MTP head setup/teardown vs in-flight generate_mtp (no deactivate-during-generate UAF)

Definition at line 647 of file llama_cpp_backend.h.

◆ mtp_n_max_

int entropic::LlamaCppBackend::mtp_n_max_ = 16

protected

MTP draft window (n_max) of the live head.

Definition at line 646 of file llama_cpp_backend.h.

◆ next_temp_seq_id_

llama_seq_id entropic::LlamaCppBackend::next_temp_seq_id_ = 1

protected

gh#98: monotonic high-water for NEW temp seq_ids (the old 1 + size() handed out duplicates when the pool was empty — every call returned 1 — colliding the batch's sequences).

Released ids return to free_seq_ids_ and are reused first, so this stays bounded by concurrent demand.

Definition at line 1059 of file llama_cpp_backend.h.

◆ parse_chat_format_

int entropic::LlamaCppBackend::parse_chat_format_ = 0

protected

Last TOOLED render's format.

Definition at line 692 of file llama_cpp_backend.h.

◆ parse_generation_prompt_

std::string entropic::LlamaCppBackend::parse_generation_prompt_

protected

Last TOOLED render's gen prompt.

Definition at line 693 of file llama_cpp_backend.h.

◆ parse_params_valid_

bool entropic::LlamaCppBackend::parse_params_valid_ = false

protected

True once a tooled render snapshotted.

Definition at line 695 of file llama_cpp_backend.h.

◆ parse_parser_

std::string entropic::LlamaCppBackend::parse_parser_

protected

Last TOOLED render's PEG arena.

Definition at line 694 of file llama_cpp_backend.h.

◆ prompt_cache_

std::unique_ptr<PromptCache> entropic::LlamaCppBackend::prompt_cache_

protected

KV prefix cache (v1.8.3)

Definition at line 674 of file llama_cpp_backend.h.

◆ prompt_cache_config_

PromptCacheConfig entropic::LlamaCppBackend::prompt_cache_config_

protected

Cache config (v1.8.3)

Definition at line 673 of file llama_cpp_backend.h.

◆ resident_tokens_

std::vector<llama_token> entropic::LlamaCppBackend::resident_tokens_

protected

gh#96: tokens resident in KV seq 0 (warm-keep)

Definition at line 640 of file llama_cpp_backend.h.

◆ sampler_factory_

std::unique_ptr<SamplerFactory> entropic::LlamaCppBackend::sampler_factory_

protected

Factory used by the decode loop to build per-generation samplers.

Created in do_activate against the live ctx_ / vocab_, destroyed in do_deactivate BEFORE those resources are freed (the factory borrows them). Unit tests inject a MockSamplerFactory via inject_sampler_factory_for_test to exercise the decode-loop call site without a real context.

Version: 2.3.10

Definition at line 669 of file llama_cpp_backend.h.

◆ seq_id_mutex_

std::mutex entropic::LlamaCppBackend::seq_id_mutex_

protected

Guards temp seq_id pool (v1.9.10)

Definition at line 1053 of file llama_cpp_backend.h.

◆ tokenizer_

std::unique_ptr<Tokenizer> entropic::LlamaCppBackend::tokenizer_

protected

Tokenizer used by tokenize_text / do_count_tokens / internal tokenize/detokenize.

Created in do_load against the loaded vocab, destroyed in do_unload BEFORE the model is freed (the vocab pointer is borrowed). Unit tests inject a MockTokenizer via the test-only constructor to exercise the LlamaCppBackend public methods without a real model.

Version: 2.3.10

Definition at line 658 of file llama_cpp_backend.h.

◆ vocab_

const llama_vocab* entropic::LlamaCppBackend::vocab_ = nullptr

protected

Vocabulary (from model_)

Definition at line 635 of file llama_cpp_backend.h.

The documentation for this class was generated from the following files:

inference/llama_cpp_backend.h
inference/llama_cpp_backend.cpp

Classes

Public Member Functions

Protected Member Functions

Static Protected Member Functions

Protected Attributes

Detailed Description

Constructor & Destructor Documentation

◆ ~LlamaCppBackend()

Member Function Documentation

◆ allocate_temp_seq_id()

◆ allocate_temp_seq_id_for_test()

◆ apply_chat_template()

◆ apply_chat_template_lowlevel()

◆ build_batch_results()

◆ build_mtp_head()

◆ clear_prompt_cache()

◆ common_chat_parse_reliable()

◆ compute_prefix_token_count()

◆ create_inference_context()

◆ create_sampler()

◆ decode_loop()

◆ decode_tokens_from()

◆ detokenize()

◆ do_activate()

◆ do_backend_name()

◆ do_clear_state()

◆ do_complete()

◆ do_count_tokens()

◆ do_deactivate()

◆ do_evaluate_logprobs()

◆ do_generate() [1/2]

◆ do_generate() [2/2]

◆ do_generate_batch()

◆ do_generate_speculative()

◆ do_generate_streaming()

◆ do_generate_streaming_text_only()

◆ do_generate_text_only() [1/2]

◆ do_generate_text_only() [2/2]

◆ do_info()

◆ do_load()

◆ do_restore_state()

◆ do_save_state()

◆ do_supports()

◆ do_unload()

◆ effective_stop()

◆ extract_system_prompt()

◆ extract_token_logprob()

◆ generate_after_prefill()

◆ generate_mtp()

◆ generate_multimodal()

◆ generate_speculative_with_draft()

◆ has_common_chat_params()

◆ init_mmproj_if_configured()

◆ inject_sampler_factory_for_test()

◆ inject_tokenizer_for_test()

◆ invalidate_resident_kv()

◆ is_recurrent()

◆ kv_pos_max()

◆ last_gen_decode_calls()

◆ last_input_tokens()

◆ last_prefill_ms()

◆ last_prefill_tokens()

◆ llama_context_ptr()

◆ llama_model_ptr()

◆ load_gpu_model()

◆ mtmd_prefill()

◆ mtp_active()

◆ mtp_guard()

◆ parse_response()

◆ prefill_and_cache_prefix()

◆ prefill_batch_suffixes()

◆ prefill_dispatch()

◆ prefill_shared_and_fanout()

◆ prepare_batch_seqs()

◆ release_temp_seq_id()

◆ release_temp_seq_id_for_test()

◆ release_temp_seqs()

◆ reload_model_cpu_only()

◆ render_prompt()

◆ render_with_tools()