121 const std::vector<Message>& messages,
134 const std::vector<Message>& messages,
136 std::function<
void(std::string_view token)> on_token,
137 std::atomic<bool>& cancel);
156 const std::vector<Message>& messages,
158 std::function<
void(std::string_view token)> on_token,
159 std::atomic<bool>& cancel);
169 const std::string& prompt,
190 const int32_t* tokens,
205 const int32_t* tokens,
249 const std::string& text)
const {
return {}; }
334 bool save_state(
int seq_id, std::vector<uint8_t>& buffer)
const;
343 bool restore_state(
int seq_id,
const std::vector<uint8_t>& buffer);
372 const std::vector<Message>& messages,
387 const std::vector<Message>& messages,
389 std::function<
void(std::string_view token)> on_token,
390 std::atomic<bool>& cancel);
429 const std::vector<Message>& messages,
442 const std::vector<Message>& messages,
444 std::function<
void(std::string_view token)> on_token,
445 std::atomic<bool>& cancel) = 0;
471 const std::vector<Message>& messages,
473 std::function<
void(std::string_view token)> on_token,
474 std::atomic<bool>& cancel);
484 const std::string& prompt,
511 const int32_t* tokens,
555 std::vector<uint8_t>& buffer)
const;
565 const std::vector<uint8_t>& buffer);
588 const std::vector<Message>& messages,
606 const std::vector<Message>& messages,
608 std::function<
void(std::string_view token)> on_token,
609 std::atomic<bool>& cancel);
627 void set_hooks(
const HookInterface& hooks) { hooks_ = hooks; }
632 std::mutex transition_mutex_;
633 std::mutex eval_mutex_;
634 HookInterface hooks_;
Backend capability flags and metadata for architecture-agnostic queries.
Concrete base class for inference backends (80% logic).
virtual GenerationResult do_complete(const std::string &prompt, const GenerationParams &params)=0
Subclass raw completion.
virtual GenerationResult do_generate_streaming_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Streaming generation with sequence ID.
GenerationResult generate_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params)
Generate with explicit sequence ID.
virtual LogprobResult do_evaluate_logprobs(const int32_t *tokens, int n_tokens)=0
Backend-specific logprob evaluation.
GenerationResult generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Generate via the speculative-decoding kernel (v2.1.11).
float compute_perplexity(const int32_t *tokens, int n_tokens)
Compute perplexity for a token sequence.
std::string last_error_
Last error message for diagnostics.
virtual BackendInfo do_info() const
Populate backend metadata.
virtual GenerationResult do_generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)=0
Subclass streaming generation.
bool save_state(int seq_id, std::vector< uint8_t > &buffer) const
Save model state to buffer.
void set_hooks(const HookInterface &hooks)
Set the hook dispatch interface.
bool supports(BackendCapability cap) const
Query whether this backend supports a capability.
bool restore_state(int seq_id, const std::vector< uint8_t > &buffer)
Restore model state from buffer.
bool activate()
Promote to GPU (WARM → ACTIVE).
virtual bool do_restore_state(int seq_id, const std::vector< uint8_t > &buffer)
Restore model state.
virtual int do_count_tokens(const std::string &text) const =0
Subclass token counting.
virtual bool do_supports(BackendCapability cap) const
Declare supported capabilities.
void deactivate()
Release GPU layers (ACTIVE → WARM).
virtual void do_unload()=0
Full unload.
virtual bool do_activate()=0
Promote loaded model to GPU.
BackendInfo info() const
Get backend metadata.
bool is_active() const
True when state is ACTIVE.
ModelState state() const
Current lifecycle state (lock-free read).
virtual bool do_load(const ModelConfig &config)=0
Load model into CPU RAM.
virtual GenerationResult do_generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Subclass speculative-decoding streaming generation.
virtual void clear_prompt_cache()
Invalidate any backend-owned prompt/KV caches.
virtual void do_deactivate()=0
Release GPU, keep CPU.
virtual std::vector< int32_t > tokenize_text(const std::string &text) const
Tokenize text to token IDs.
virtual GenerationResult do_generate(const std::vector< Message > &messages, const GenerationParams &params)=0
Subclass generation.
std::vector< BackendCapability > capabilities() const
Get all supported capabilities as a vector.
void unload()
Full unload (→ COLD).
const ModelConfig & config() const
Stored model config.
bool clear_state(int seq_id=-1)
Clear/reset model state for a sequence.
virtual GenerationResult do_generate_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params)
Generate with sequence ID.
virtual std::string do_backend_name() const =0
Return backend name identifier.
bool is_loaded() const
True when state is WARM or ACTIVE.
GenerationResult generate(const std::vector< Message > &messages, const GenerationParams &params)
Generate a complete response.
bool load(const ModelConfig &config)
Load model into CPU RAM (COLD → WARM).
virtual bool do_clear_state(int seq_id)
Clear/reset model state.
int count_tokens(const std::string &text) const
Count tokens using model's tokenizer.
virtual bool do_save_state(int seq_id, std::vector< uint8_t > &buffer) const
Save model state (KV cache or hidden state).
bool fire_model_load_hook(const ModelConfig &config)
Fire ON_MODEL_LOAD pre-hook.
GenerationResult generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Generate with per-token streaming callback.
LogprobResult evaluate_logprobs(const int32_t *tokens, int n_tokens)
Evaluate per-token log-probabilities for a token sequence.
int context_length() const
Model's context window size.
bool load_and_activate(const ModelConfig &config)
Convenience: load() + activate().
GenerationResult complete(const std::string &prompt, const GenerationParams &params)
Raw text completion without chat template.
GenerationResult generate_streaming_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Streaming generation with explicit sequence ID.
Configuration structs with defaults.
Generation output with metrics.
Hook dispatch interface injected into engine subsystems.
Per-token log-probability evaluation result.
Message struct for conversation history.
Activate model on GPU (WARM → ACTIVE).
BackendCapability
Capabilities that an inference backend may or may not support.
ModelState
C++ enum class for model VRAM lifecycle states.
@ ACTIVE
GPU layers loaded, full speed.
@ COLD
On disk only, no RAM consumed.
Backend metadata for introspection.
Generation parameters for a single inference call.
Result of a single generation call.
Per-token log-probability evaluation result.
Model configuration for a single tier.
int context_length
Context window size (512–131072)