Entropic 2.3.8
Local-first agentic inference engine
Loading...
Searching...
No Matches
llama_cpp_backend.h
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
25#pragma once
26
28
29#include "prompt_cache.h"
30
31#include <llama.h>
32
33#include <atomic>
34#include <chrono>
35#include <cstdint>
36#include <functional>
37#include <memory>
38#include <mutex>
39#include <string>
40#include <vector>
41
42// Forward-declare libmtmd's opaque types at file scope so they
43// resolve to ::mtmd_context / ::mtmd_bitmap (not entropic::mtmd_*)
44// when referenced inside the class body. Full types live in
45// extern/llama.cpp/tools/mtmd/mtmd.h and are only included from
46// the implementation file. (v2.1.8, gh#37/v1.9.11 Phase 5)
47extern "C" {
// Opaque libmtmd context type; only pointers to it are used in this header.
48struct mtmd_context;
// Opaque libmtmd bitmap type; only pointers to it are used in this header.
49struct mtmd_bitmap;
50}
51
52namespace entropic {
53
64public:
// Destructor: frees llama.cpp + mtmd resources on destruction.
79 ~LlamaCppBackend() override;
80
94
// Drop every cached prefix so the next prefill re-seeds.
// Safe to call when no prompt cache exists: the null check below makes
// the call a no-op in that case.
105 void clear_prompt_cache() override {
106 if (prompt_cache_) { prompt_cache_->clear(); }
107 }
108
// Tokenize text to token IDs using the model vocabulary.
// @param text  Input text to tokenize.
// @return Token IDs as int32_t values.
115 std::vector<int32_t> tokenize_text(
116 const std::string& text) const override;
117
118 /* ── llama.cpp handle accessors (v1.9.2) ────────────── */
119
// Get the loaded llama_model pointer.
// Returns the raw model_ handle as-is; model_ is declared with a nullptr
// default, so callers should be prepared for a null result.
// NOTE(review): the destructor is documented as freeing llama.cpp
// resources, so callers presumably must not free this pointer — confirm.
126 llama_model* llama_model_ptr() { return model_; }
127
// Get the active llama_context pointer.
// Returns the raw ctx_ handle as-is; ctx_ is declared with a nullptr
// default, so callers should be prepared for a null result.
// NOTE(review): ctx_ is described as valid in the ACTIVE state only —
// confirm whether it is reset to nullptr on deactivate.
134 llama_context* llama_context_ptr() { return ctx_; }
135
136protected:
137 /* ── Lifecycle overrides ─────────────────────────────── */
138
// Load model into CPU RAM (COLD → WARM).
139 bool do_load(const ModelConfig& config) override;
// Activate model on GPU (WARM → ACTIVE).
140 bool do_activate() override;
// Deactivate: free context, reload model CPU-only.
141 void do_deactivate() override;
// Full unload — free all resources, clear prompt cache.
142 void do_unload() override;
143
144 /* ── Generation overrides ────────────────────────────── */
145
147 const std::vector<Message>& messages,
148 const GenerationParams& params) override;
149
151 const std::vector<Message>& messages,
152 const GenerationParams& params,
153 std::function<void(std::string_view token)> on_token,
154 std::atomic<bool>& cancel) override;
155
170 const std::vector<Message>& messages,
171 const GenerationParams& params,
172 std::function<void(std::string_view token)> on_token,
173 std::atomic<bool>& cancel) override;
174
175public:
210 const std::vector<Message>& messages,
211 const GenerationParams& params,
212 std::function<void(std::string_view token)> on_token,
213 std::atomic<bool>& cancel,
214 LlamaCppBackend& draft,
215 int n_draft_max,
216 const std::string& draft_path);
217
218protected:
220 const std::string& prompt,
221 const GenerationParams& params) override;
222
// Count tokens in text.
// @param text  Input text to measure.
// @return Token count as an int.
223 int do_count_tokens(const std::string& text) const override;
224
225 /* ── Evaluation override (v1.9.10) ──────────────────── */
226
228 const int32_t* tokens,
229 int n_tokens) override;
230
231 /* ── Capability overrides (v1.9.13) ──────────────────── */
232
// Declare llama.cpp backend capabilities.
233 bool do_supports(BackendCapability cap) const override;
// Return backend name.
234 std::string do_backend_name() const override;
// Populate backend metadata from llama.cpp model.
235 BackendInfo do_info() const override;
// Clear KV cache or recurrent hidden state.
// NOTE(review): seq_id presumably selects which sequence to clear — confirm
// against the implementation.
236 bool do_clear_state(int seq_id) override;
237
238 /* ── llama.cpp handles ───────────────────────────────── */
239
// Loaded model (WARM+); nullptr by default.
240 llama_model* model_ = nullptr;
// Inference context (ACTIVE); nullptr by default.
241 llama_context* ctx_ = nullptr;
// Vocabulary (from model_); nullptr by default.
242 const llama_vocab* vocab_ = nullptr;
243
244 /* ── Prompt cache ───────────────────────────────────── */
245
// KV prefix cache (v1.8.3). May be null: clear_prompt_cache() checks it
// before use.
247 std::unique_ptr<PromptCache> prompt_cache_;
248
249 /* ── Internal helpers ────────────────────────────────── */
250
// Tokenize text using model vocabulary.
// @param text         Input text to tokenize.
// @param add_special  NOTE(review): presumably controls whether special
//                     tokens are added — confirm against the implementation.
// @return Token sequence as llama_token values.
258 std::vector<llama_token> tokenize(
259 const std::string& text, bool add_special) const;
260
// Detokenize a single token.
// @param token  Token to convert.
// @return The token rendered as a string.
267 std::string detokenize(llama_token token) const;
268
// Apply chat template to messages.
// @param messages  Conversation messages to render.
// @param params    Generation parameters for this call.
// @return The rendered prompt string.
276 std::string apply_chat_template(
277 const std::vector<Message>& messages,
278 const GenerationParams& params) const;
279
290 const std::vector<llama_token>& tokens,
291 const GenerationParams& params,
292 std::function<void(std::string_view)> on_token,
293 std::atomic<bool>* cancel);
294
// Run batched prefill on input tokens.
// @param tokens  Prompt tokens to prefill.
// @return NOTE(review): presumably true on success — confirm.
301 bool run_prefill(const std::vector<llama_token>& tokens);
302
// Generate one token and append to output.
// @param sampler    Sampler used to pick the token.
// @param generated  Output accumulator, taken by non-const reference.
// @param on_token   Per-token callback, taken by non-const reference.
// @param stop       Stop strings.
// @return NOTE(review): meaning of the returned string is not visible from
//         this header (possibly a finish reason or the token text) — confirm.
312 std::string step_token(
313 llama_sampler* sampler,
314 std::string& generated,
315 std::function<void(std::string_view)>& on_token,
316 const std::vector<std::string>& stop);
317
// Create sampler chain from generation params.
// @param params  Generation parameters that configure the chain.
// @return Raw llama_sampler pointer.
// NOTE(review): a raw pointer is returned, so the caller presumably owns
// and must free it — confirm.
324 llama_sampler* create_sampler(const GenerationParams& params) const;
325
// Extract the system prompt from messages.
// Static: does not touch backend state.
// @param messages  Conversation messages to scan.
// @return The system prompt text.
332 static std::string extract_system_prompt(
333 const std::vector<Message>& messages);
334
345 const std::vector<llama_token>& tokens,
346 const std::string& system_prompt,
347 const std::vector<Message>& messages,
348 const GenerationParams& params);
349
358 const std::vector<llama_token>& tokens, int start_offset);
359
368 const CacheEntry* cached,
369 const std::vector<llama_token>& tokens);
370
380 const std::vector<llama_token>& tokens,
381 int prefix_tokens,
382 const CacheKey& key);
383
// Capture seq 0 KV state and store under the given key.
// @param key            Cache key to store the snapshot under.
// @param prefix_tokens  NOTE(review): presumably the number of prefix tokens
//                       covered by the snapshot — confirm.
390 void save_prefix_to_cache(const CacheKey& key, int prefix_tokens);
391
400 const std::vector<Message>& messages,
401 const GenerationParams& params);
402
403 /* ── Evaluation helpers (v1.9.10) ───────────────────── */
404
// Allocate a temporary sequence ID for evaluation.
410 llama_seq_id allocate_temp_seq_id();
411
// Release a temporary sequence ID back to the pool.
// @param seq_id  ID to return to the pool.
417 void release_temp_seq_id(llama_seq_id seq_id);
418
// Extract log-probability for a token from logits.
// Static: does not touch backend state.
// @param logits      Pointer to the logits array.
// @param next_token  Token whose log-probability is wanted.
// @param n_vocab     NOTE(review): presumably the number of entries in
//                    logits (vocabulary size) — confirm.
// @return Log-probability of next_token.
431 static float extract_token_logprob(
432 const float* logits,
433 int32_t next_token,
434 int n_vocab);
435
// Guards temp seq_id pool (v1.9.10).
436 std::mutex seq_id_mutex_;
// Available temporary seq_ids (v1.9.10).
437 std::vector<llama_seq_id> free_seq_ids_;
438
439 /* ── Architecture detection (v1.9.13) ──────────────── */
440
// True if loaded model is recurrent (GDN/Mamba/RWKV); false by default.
446 bool is_recurrent_ = false;
447
// Check if loaded model is recurrent.
453 bool is_recurrent() const;
454
455 /* ── Vision / multimodal (v1.9.11 Phases 5–7 + v2.1.8) ── */
456
// libmtmd context, or nullptr if no mmproj loaded.
// The leading :: names the file-scope forward declaration rather than
// anything inside namespace entropic.
464 ::mtmd_context* mtmd_ctx_ = nullptr;
465
// Cached mtmd_support_vision(mtmd_ctx_) result; false by default.
468 bool has_vision_ = false;
469
490 const std::vector<Message>& messages,
491 const GenerationParams& params,
492 std::function<void(std::string_view token)> on_token,
493 std::atomic<bool>* cancel);
494
501
// Load the GGUF model onto the GPU (do_activate step 1).
// @return NOTE(review): presumably true on success — confirm.
512 bool load_gpu_model();
513
525
536 const std::string& prompt,
537 const std::vector<::mtmd_bitmap*>& bitmaps,
538 std::string& err_msg);
539
555 const GenerationParams& params,
556 std::function<void(std::string_view token)> on_token,
557 std::atomic<bool>* cancel,
558 const std::chrono::steady_clock::time_point& t0);
559
566 const std::vector<Message>& messages,
567 const GenerationParams& params);
568
575 const std::vector<Message>& messages,
576 const GenerationParams& params,
577 std::function<void(std::string_view token)> on_token,
578 std::atomic<bool>& cancel);
579};
580
581} // namespace entropic
Concrete base class for inference backends (80% logic).
Definition backend.h:69
const ModelConfig & config() const
Stored model config.
Definition backend.h:278
LlamaCppBackend — common llama.cpp patterns (15% layer).
bool load_gpu_model()
Load the GGUF model onto the GPU (do_activate step 1).
bool do_load(const ModelConfig &config) override
Load model into CPU RAM (COLD → WARM).
bool do_supports(BackendCapability cap) const override
Declare llama.cpp backend capabilities.
GenerationResult decode_loop(const std::vector< llama_token > &tokens, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > *cancel)
Core decode loop — shared by generate and streaming.
void clear_prompt_cache() override
Drop every cached prefix so the next prefill re-seeds.
bool is_recurrent_
True if loaded model is recurrent (GDN/Mamba/RWKV).
LogprobResult do_evaluate_logprobs(const int32_t *tokens, int n_tokens) override
Evaluate per-token log-probabilities via sequential decode.
std::string do_backend_name() const override
Return backend name.
std::unique_ptr< PromptCache > prompt_cache_
KV prefix cache (v1.8.3)
llama_context * llama_context_ptr()
Get the active llama_context pointer.
std::string step_token(llama_sampler *sampler, std::string &generated, std::function< void(std::string_view)> &on_token, const std::vector< std::string > &stop)
Generate one token and append to output.
GenerationResult do_generate(const std::vector< Message > &messages, const GenerationParams &params) override
Generate a complete response using chat template.
void set_prompt_cache_config(const PromptCacheConfig &config)
Set prompt cache configuration.
GenerationResult do_complete(const std::string &prompt, const GenerationParams &params) override
Raw text completion without chat template.
std::vector< llama_token > tokenize(const std::string &text, bool add_special) const
Tokenize text using model vocabulary.
bool create_inference_context()
Create the llama context + prompt cache (do_activate step 2).
const llama_vocab * vocab_
Vocabulary (from model_)
int compute_prefix_token_count(const std::vector< Message > &messages, const GenerationParams &params)
Compute token count of system messages only.
std::string detokenize(llama_token token) const
Detokenize a single token.
void init_mmproj_if_configured()
Initialize the libmtmd context if mmproj is configured.
GenerationResult generate_speculative_with_draft(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel, LlamaCppBackend &draft, int n_draft_max, const std::string &draft_path)
Speculative-decoding kernel with explicit draft backend.
llama_context * ctx_
Inference context (ACTIVE)
bool run_prefill(const std::vector< llama_token > &tokens)
Run batched prefill on input tokens.
GenerationResult run_sampling_loop(const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel, const std::chrono::steady_clock::time_point &t0)
Sample tokens until stop / max_tokens / cancel.
bool restore_cached_prefix(const CacheEntry *cached, const std::vector< llama_token > &tokens)
Restore KV state from cache and decode remaining tokens.
void save_prefix_to_cache(const CacheKey &key, int prefix_tokens)
Capture seq 0 KV state and store under the given key.
std::vector< int32_t > tokenize_text(const std::string &text) const override
Tokenize text to token IDs using model vocabulary.
bool is_recurrent() const
Check if loaded model is recurrent.
entropic_error_t mtmd_prefill(const std::string &prompt, const std::vector<::mtmd_bitmap * > &bitmaps, std::string &err_msg)
Run mtmd_tokenize + mtmd_helper_eval_chunks on a prompt.
bool run_prefill_cached(const std::vector< llama_token > &tokens, const std::string &system_prompt, const std::vector< Message > &messages, const GenerationParams &params)
Run prefill with prompt cache integration.
GenerationResult do_generate_text_only(const std::vector< Message > &messages, const GenerationParams &params)
Text-only batch generation (extracted from do_generate).
std::string apply_chat_template(const std::vector< Message > &messages, const GenerationParams &params) const
Apply chat template to messages.
GenerationResult do_generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
Streaming generation with per-token callback.
bool has_vision_
Cached mtmd_support_vision(mtmd_ctx_) result.
bool decode_tokens_from(const std::vector< llama_token > &tokens, int start_offset)
Decode tokens starting at a given offset.
void release_temp_seq_id(llama_seq_id seq_id)
Release a temporary sequence ID back to the pool.
llama_model * llama_model_ptr()
Get the loaded llama_model pointer.
int do_count_tokens(const std::string &text) const override
Count tokens in text.
::mtmd_context * mtmd_ctx_
libmtmd context, or nullptr if no mmproj loaded.
GenerationResult generate_multimodal(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel)
Multimodal generation core (v1.9.11 Phases 5–7).
std::mutex seq_id_mutex_
Guards temp seq_id pool (v1.9.10)
bool do_clear_state(int seq_id) override
Clear KV cache or recurrent hidden state.
static float extract_token_logprob(const float *logits, int32_t next_token, int n_vocab)
Extract log-probability for a token from logits.
void do_deactivate() override
Deactivate: free context, reload model CPU-only.
llama_sampler * create_sampler(const GenerationParams &params) const
Create sampler chain from generation params.
BackendInfo do_info() const override
Populate backend metadata from llama.cpp model.
bool do_activate() override
Activate model on GPU (WARM → ACTIVE).
bool prefill_and_cache_prefix(const std::vector< llama_token > &tokens, int prefix_tokens, const CacheKey &key)
Two-pass prefill: prefix-only prefill → save → rest.
llama_seq_id allocate_temp_seq_id()
Allocate a temporary sequence ID for evaluation.
PromptCacheConfig prompt_cache_config_
Cache config (v1.8.3)
void do_unload() override
Full unload — free all resources, clear prompt cache.
llama_model * model_
Loaded model (WARM+)
~LlamaCppBackend() override
Free llama.cpp + mtmd resources on destruction.
std::vector< llama_seq_id > free_seq_ids_
Available temporary seq_ids (v1.9.10)
GenerationResult do_generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
Speculative streaming via the abstract InferenceBackend interface (kept as NOT_SUPPORTED — see kernel...
GenerationResult do_generate_streaming_text_only(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Text-only streaming generation (extracted from streaming).
static std::string extract_system_prompt(const std::vector< Message > &messages)
Extract the system prompt from messages.
entropic_error_t
Error codes returned by all C API functions.
Definition error.h:35
InferenceBackend concrete base class.
Activate model on GPU (WARM → ACTIVE).
BackendCapability
Capabilities that an inference backend may or may not support.
Host-memory KV cache state storage with LRU eviction.
Backend metadata for introspection.
Single cached KV state snapshot.
64-bit hash used as cache lookup key.
Generation parameters for a single inference call.
Definition config.h:227
Result of a single generation call.
Per-token log-probability evaluation result.
Model configuration for a single tier.
Definition config.h:148
Prompt caching configuration.
Definition config.h:196