LlamaCppBackend implementation — direct llama.cpp C API. More...

#include "llama_cpp_backend.h"
#include "llama_cpp_sampler.h"
#include "llama_cpp_tokenizer.h"
#include "warm_keep_util.h"
#include "tool_call_markers.h"
#include "batch_util.h"
#include "mtp_envelope.h"
#include <entropic/inference/adapters/adapter_base.h>
#include <entropic/types/logging.h>
#include <common.h>
#include <chat.h>
#include <sampling.h>
#include <speculative.h>
#include <mtmd.h>
#include <mtmd-helper.h>
#include <nlohmann/json.hpp>
#include <cmath>
#include <cstring>
#include <optional>
#include <stdexcept>

Include dependency graph for llama_cpp_backend.cpp:

Go to the source code of this file.

Classes
struct	entropic::SpeculativeRunState
	Bundles per-kernel-run mutable state to keep the loop body focused on its responsibility (knots: cognitive ≤ 15, ≤ 3 returns). More...

Namespaces
namespace	entropic
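	Namespace containing the declarations listed on this page. NOTE: the extracted brief, "Activate model on GPU (WARM → ACTIVE).", describes a single operation and appears to have been picked up from a member's doc comment rather than written for the namespace — confirm against the source.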

Functions
static std::vector< llama_chat_message >	entropic::to_llama_chat (const std::vector< Message > &messages)
	Convert engine messages to llama_chat_message views.

static std::vector< common_chat_msg >	entropic::to_common_chat (const std::vector< Message > &messages)
	Convert engine messages to common_chat_msg (gh#86, v2.6.1).

static std::vector< common_chat_tool >	entropic::mcp_tools_to_common_chat (const std::string &tools_json)
	Convert entropic MCP tool JSON to common_chat_tool defs (gh#87).

static ToolCall	entropic::to_entropic_tool_call (const common_chat_tool_call &cc)
	Map a common_chat_tool_call to entropic's ToolCall (gh#87).

static std::optional< common_chat_params >	entropic::render_common_chat (llama_model *model, const std::vector< Message > &messages, const GenerationParams &params, const std::vector< common_chat_tool > &tools)
	Shared common_chat render core for both template paths (gh#87).

static std::string	entropic::concat_messages_fallback (const std::vector< Message > &messages)
	Plain "role: content" join used when templating fails.

void	entropic::strip_thinking_channels (std::string &content, std::string *reasoning_out)
	Strip Gemma 4 QAT reasoning channels (`<|channel>…<channel|>`) from content, accumulating the stripped text into `reasoning_out`.

static GenerationResult	entropic::batch_error_result (const std::string &msg)
	Build a single error GenerationResult (gh#98 batch failures).

static void	entropic::fill_batch_cell (llama_batch &b, int k, llama_token tok, llama_pos pos, llama_seq_id seq, bool want_logits)
	Fill one cell of a multi-seq llama_batch.

static void	entropic::spec_cleanup (SpeculativeRunState &state)
	Free everything allocated by the kernel.

static void	entropic::spec_build_batch (SpeculativeRunState &state)
	Build the target batch [id_last, draft0, ..., draftN-1].

static bool	entropic::spec_decode_both (SpeculativeRunState &state)
	Decode the speculative batch on both contexts.

static int	entropic::spec_run_draft (SpeculativeRunState &state)
	Trigger draft generation via common_speculative_draft.

static std::string	entropic::spec_emit_token (SpeculativeRunState &state, llama_token id, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
	Emit on_token for one accepted id, updating state and returning a stop signal when terminating conditions apply.

static void	entropic::spec_ckpt_save_dft (SpeculativeRunState &state)
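	Snapshot the draft's pre-draft state (the state that spec_ckpt_restore_dft later restores). NOTE: the extracted brief, "Drive one accept round: draft → decode → sample-and-accept → emit tokens.", describes the accept-round driver and appears to be misattached to this function — confirm against the source.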

static void	entropic::spec_ckpt_save_tgt (SpeculativeRunState &state)
	Snapshot target state right before the target decode of the speculative batch (when use_ckpt_tgt + non-empty draft).

static void	entropic::spec_ckpt_restore_dft (SpeculativeRunState &state)
	Restore the draft's pre-draft state so the upcoming target-batch decode on the draft re-fills cleanly.

static void	entropic::spec_rollback_partial (SpeculativeRunState &state, common_sampler *smpl_save, std::vector< llama_token > &ids)
	Partial-acceptance rollback: restore both contexts and the sampler to their pre-draft state, then arrange for the outer loop to re-decode with the partial accept as the new draft.

static void	entropic::spec_trim_rejected_drafts (SpeculativeRunState &state)
	Clear any stale KV positions left by rejected draft tokens.

static bool	entropic::spec_commit_accepted (SpeculativeRunState &state, const std::vector< llama_token > &ids, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
	Walk accepted ids, emit tokens via callback, update state.

static int	entropic::spec_prepare_draft (SpeculativeRunState &state)
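	Drive one accept round: optional draft generation, decode on both contexts, sample-and-accept, emit tokens (or roll back via checkpoint on partial acceptance). NOTE: this brief describes a complete accept round (compare spec_accept_round) and looks misattached; spec_prepare_draft presumably covers only the draft-preparation step of a round — confirm against the source.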

static bool	entropic::spec_accept_round (SpeculativeRunState &state, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
	Run one speculative accept round; return false to stop.

static std::string	entropic::spec_check_preconditions (bool target_active, bool draft_active, llama_context *ctx_tgt, llama_context *ctx_dft)
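	Validate speculative preconditions and reject contexts classified as NO-seq_rm (presumably contexts that do not support sequence removal — confirm against the source).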

static std::string	entropic::spec_init_sampler_and_decoder (SpeculativeRunState &state, llama_model *model_tgt, const GenerationParams &params, int n_draft_max, const std::string &draft_path)
	Initialize the kernel state: clear KV, prefill, sampler, speculative context, batch, and detect FULL-seq_rm checkpoint-mode for target/draft.

static std::string	entropic::spec_init_run (SpeculativeRunState &state, llama_model *model_tgt, const std::vector< llama_token > &tokens, const GenerationParams &params, int n_draft_max, const std::string &draft_path)
	Initialize speculative run state (prefill + sampler + decoder).

static void	entropic::spec_run_loop (SpeculativeRunState &state, const llama_vocab *vocab, int max_tokens, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel)
	Run the accept-round loop until completion / EOS / cancel.

static GenerationResult	entropic::spec_finalize (SpeculativeRunState &state, std::chrono::steady_clock::time_point t0)
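	Speculative kernel against an explicit draft backend. NOTE: this brief reads like a description of the kernel entry point rather than of a finalize step; given the signature (run state plus start time, returning GenerationResult), spec_finalize presumably assembles the final result — confirm against the source.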

static GenerationResult	entropic::spec_run_from_tokens (llama_context *ctx_tgt, llama_context *ctx_dft, llama_model *model_tgt, const std::vector< llama_token > &tokens, const GenerationParams &params, std::function< void(std::string_view)> &on_token, std::atomic< bool > &cancel, int n_draft_max, const std::string &draft_path, std::chrono::steady_clock::time_point t0)
	Public entry point for the speculative-decoding kernel.

Detailed Description

LlamaCppBackend implementation — direct llama.cpp C API.

Pinned against llama.cpp submodule b8420. Uses:

- llama_model_load_from_file() for model loading
- llama_init_from_model() for context creation
- llama_decode() + llama_batch for token processing
- llama_sampler_chain for sampling
- llama_chat_apply_template() for chat formatting

Version: 1.8.3

Definition in file llama_cpp_backend.cpp.

Classes

Namespaces

Functions

Detailed Description