116 const std::string& text)
const override;
147 const std::vector<Message>& messages,
151 const std::vector<Message>& messages,
153 std::function<
void(std::string_view token)> on_token,
154 std::atomic<bool>& cancel)
override;
170 const std::vector<Message>& messages,
172 std::function<
void(std::string_view token)> on_token,
173 std::atomic<bool>& cancel)
override;
210 const std::vector<Message>& messages,
212 std::function<
void(std::string_view token)> on_token,
213 std::atomic<bool>& cancel,
216 const std::string& draft_path);
220 const std::string& prompt,
228 const int32_t* tokens,
229 int n_tokens)
override;
259 const std::string& text,
bool add_special)
const;
267 std::string
detokenize(llama_token token)
const;
277 const std::vector<Message>& messages,
290 const std::vector<llama_token>& tokens,
292 std::function<
void(std::string_view)> on_token,
293 std::atomic<bool>* cancel);
301 bool run_prefill(
const std::vector<llama_token>& tokens);
313 llama_sampler* sampler,
314 std::string& generated,
315 std::function<
void(std::string_view)>& on_token,
316 const std::vector<std::string>& stop);
333 const std::vector<Message>& messages);
345 const std::vector<llama_token>& tokens,
346 const std::string& system_prompt,
347 const std::vector<Message>& messages,
358 const std::vector<llama_token>& tokens,
int start_offset);
369 const std::vector<llama_token>& tokens);
380 const std::vector<llama_token>& tokens,
400 const std::vector<Message>& messages,
490 const std::vector<Message>& messages,
492 std::function<
void(std::string_view token)> on_token,
493 std::atomic<bool>* cancel);
536 const std::string& prompt,
537 const std::vector<::mtmd_bitmap*>& bitmaps,
538 std::string& err_msg);
556 std::function<
void(std::string_view token)> on_token,
557 std::atomic<bool>* cancel,
558 const std::chrono::steady_clock::time_point& t0);
566 const std::vector<Message>& messages,
575 const std::vector<Message>& messages,
577 std::function<
void(std::string_view token)> on_token,
578 std::atomic<bool>& cancel);
Concrete base class for inference backends (80% logic).
const ModelConfig & config() const
Stored model config.
LlamaCppBackend — common llama.cpp patterns (15% layer).
bool load_gpu_model()
Load the GGUF model onto the GPU (do_activate step 1).
bool do_load(const ModelConfig &config) override
Load model into CPU RAM (COLD → WARM).
bool do_supports(BackendCapability cap) const override
Declare llama.cpp backend capabilities.
GenerationResult decode_loop(const std::vector< llama_token > &tokens, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > *cancel)
Core decode loop — shared by generate and streaming.
void clear_prompt_cache() override
Drop every cached prefix so the next prefill re-seeds.
bool is_recurrent_
True if loaded model is recurrent (GDN/Mamba/RWKV).
LogprobResult do_evaluate_logprobs(const int32_t *tokens, int n_tokens) override
Evaluate per-token log-probabilities via sequential decode.
std::string do_backend_name() const override
Return backend name.
std::unique_ptr< PromptCache > prompt_cache_
KV prefix cache (v1.8.3)
llama_context * llama_context_ptr()
Get the active llama_context pointer.
std::string step_token(llama_sampler *sampler, std::string &generated, std::function< void(std::string_view)> &on_token, const std::vector< std::string > &stop)
Generate one token and append to output.
GenerationResult do_generate(const std::vector< Message > &messages, const GenerationParams &params) override
Generate a complete response using chat template.
void set_prompt_cache_config(const PromptCacheConfig &config)
Set prompt cache configuration.
GenerationResult do_complete(const std::string &prompt, const GenerationParams &params) override
Raw text completion without chat template.
std::vector< llama_token > tokenize(const std::string &text, bool add_special) const
Tokenize text using model vocabulary.
bool create_inference_context()
Create the llama context + prompt cache (do_activate step 2).
const llama_vocab * vocab_
Vocabulary (from model_)
int compute_prefix_token_count(const std::vector< Message > &messages, const GenerationParams &params)
Compute token count of system messages only.
std::string detokenize(llama_token token) const
Detokenize a single token.
void init_mmproj_if_configured()
Initialize the libmtmd context if mmproj is configured.
GenerationResult generate_speculative_with_draft(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel, LlamaCppBackend &draft, int n_draft_max, const std::string &draft_path)
Speculative-decoding kernel with explicit draft backend.
llama_context * ctx_
Inference context (ACTIVE)
bool run_prefill(const std::vector< llama_token > &tokens)
Run batched prefill on input tokens.
GenerationResult run_sampling_loop(const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel, const std::chrono::steady_clock::time_point &t0)
Sample tokens until stop / max_tokens / cancel.
bool restore_cached_prefix(const CacheEntry *cached, const std::vector< llama_token > &tokens)
Restore KV state from cache and decode remaining tokens.
void save_prefix_to_cache(const CacheKey &key, int prefix_tokens)
Capture seq 0 KV state and store under the given key.
std::vector< int32_t > tokenize_text(const std::string &text) const override
Tokenize text to token IDs using model vocabulary.
bool is_recurrent() const
Check if loaded model is recurrent.
entropic_error_t mtmd_prefill(const std::string &prompt, const std::vector<::mtmd_bitmap * > &bitmaps, std::string &err_msg)
Run mtmd_tokenize + mtmd_helper_eval_chunks on a prompt.
bool run_prefill_cached(const std::vector< llama_token > &tokens, const std::string &system_prompt, const std::vector< Message > &messages, const GenerationParams &params)
Run prefill with prompt cache integration.
GenerationResult do_generate_text_only(const std::vector< Message > &messages, const GenerationParams &params)
Text-only batch generation (extracted from do_generate).
std::string apply_chat_template(const std::vector< Message > &messages, const GenerationParams &params) const
Apply chat template to messages.
GenerationResult do_generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
Streaming generation with per-token callback.
bool has_vision_
Cached mtmd_support_vision(mtmd_ctx_) result.
bool decode_tokens_from(const std::vector< llama_token > &tokens, int start_offset)
Decode tokens starting at a given offset.
void release_temp_seq_id(llama_seq_id seq_id)
Release a temporary sequence ID back to the pool.
llama_model * llama_model_ptr()
Get the loaded llama_model pointer.
int do_count_tokens(const std::string &text) const override
Count tokens in text.
::mtmd_context * mtmd_ctx_
libmtmd context, or nullptr if no mmproj loaded.
GenerationResult generate_multimodal(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > *cancel)
Multimodal generation core (v1.9.11 Phases 5–7).
std::mutex seq_id_mutex_
Guards temp seq_id pool (v1.9.10)
bool do_clear_state(int seq_id) override
Clear KV cache or recurrent hidden state.
static float extract_token_logprob(const float *logits, int32_t next_token, int n_vocab)
Extract log-probability for a token from logits.
void do_deactivate() override
Deactivate: free context, reload model CPU-only.
llama_sampler * create_sampler(const GenerationParams &params) const
Create sampler chain from generation params.
BackendInfo do_info() const override
Populate backend metadata from llama.cpp model.
bool do_activate() override
Activate model on GPU (WARM → ACTIVE).
bool prefill_and_cache_prefix(const std::vector< llama_token > &tokens, int prefix_tokens, const CacheKey &key)
Two-pass prefill: prefix-only prefill → save → rest.
llama_seq_id allocate_temp_seq_id()
Allocate a temporary sequence ID for evaluation.
PromptCacheConfig prompt_cache_config_
Cache config (v1.8.3)
void do_unload() override
Full unload — free all resources, clear prompt cache.
llama_model * model_
Loaded model (WARM+)
~LlamaCppBackend() override
Free llama.cpp + mtmd resources on destruction.
std::vector< llama_seq_id > free_seq_ids_
Available temporary seq_ids (v1.9.10)
GenerationResult do_generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel) override
Speculative streaming via the abstract InferenceBackend interface (kept as NOT_SUPPORTED — see kernel...
GenerationResult do_generate_streaming_text_only(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Text-only streaming generation (extracted from streaming).
static std::string extract_system_prompt(const std::vector< Message > &messages)
Extract the system prompt from messages.
entropic_error_t
Error codes returned by all C API functions.
InferenceBackend concrete base class.
Activate model on GPU (WARM → ACTIVE).
BackendCapability
Capabilities that an inference backend may or may not support.
Host-memory KV cache state storage with LRU eviction.
Backend metadata for introspection.
Single cached KV state snapshot.
64-bit hash used as cache lookup key.
Generation parameters for a single inference call.
Result of a single generation call.
Per-token log-probability evaluation result.
Model configuration for a single tier.
Prompt caching configuration.