35 static constexpr const char* names[] = {
"COLD",
"WARM",
"ACTIVE"};
36 int idx =
static_cast<int>(s);
37 return (idx >= 0 && idx <= 2) ? names[idx] :
"UNKNOWN";
55 std::lock_guard<std::mutex> lock(transition_mutex_);
58 logger->info(
"[VRAM] load() no-op: already {}", state_name(
state()));
68 logger->info(
"[VRAM] Loading: {}",
config.
path.string());
69 auto start = entropic::log::now();
74 logger->error(
"[VRAM] Load failed: {}",
last_error_);
77 logger->info(
"[VRAM] Warm in {:.2f}ms", entropic::log::elapsed_ms(start, entropic::log::now()));
89 std::lock_guard<std::mutex> lock(transition_mutex_);
92 logger->info(
"[VRAM] activate() no-op: already ACTIVE");
96 logger->error(
"[VRAM] activate() failed: not WARM ({})", state_name(
state()));
100 logger->info(
"[VRAM] Activating");
101 auto start = entropic::log::now();
104 logger->error(
"[VRAM] Activate failed: {}",
last_error_);
107 logger->info(
"[VRAM] Active in {:.2f}ms", entropic::log::elapsed_ms(start, entropic::log::now()));
118 std::lock_guard<std::mutex> lock(transition_mutex_);
121 logger->info(
"[VRAM] deactivate() no-op: {}", state_name(
state()));
125 logger->info(
"[VRAM] Deactivating");
126 auto start = entropic::log::now();
131 logger->info(
"[VRAM] Deactivated in {:.2f}ms", entropic::log::elapsed_ms(start, entropic::log::now()));
140 std::lock_guard<std::mutex> lock(transition_mutex_);
143 if (hooks_.fire_info !=
nullptr) {
144 std::string json =
"{\"state\":\""
145 + std::string(state_name(
state())) +
"\"}";
146 hooks_.fire_info(hooks_.registry,
150 logger->info(
"[VRAM] Unloading from {}", state_name(
state()));
155 logger->info(
"[VRAM] Unloaded");
183 const std::vector<Message>& messages,
195 auto start = entropic::log::now();
197 result.generation_time_ms = entropic::log::elapsed_ms(start, entropic::log::now());
212 const std::vector<Message>& messages,
214 std::function<
void(std::string_view token)> on_token,
215 std::atomic<bool>& cancel)
220 err.
error_message =
"generate_streaming() requires ACTIVE state";
226 auto start = entropic::log::now();
228 result.generation_time_ms = entropic::log::elapsed_ms(start, entropic::log::now());
249 const std::vector<Message>& messages,
251 std::function<
void(std::string_view token)> on_token,
252 std::atomic<bool>& cancel)
258 "generate_speculative() requires ACTIVE state";
263 auto start = entropic::log::now();
265 messages, params, std::move(on_token), cancel);
266 result.generation_time_ms =
267 entropic::log::elapsed_ms(start, entropic::log::now());
287 const std::vector<Message>& ,
289 std::function<
void(std::string_view)> ,
295 "speculative decoding not implemented for this backend";
309 const std::string& prompt,
321 auto start = entropic::log::now();
323 result.generation_time_ms = entropic::log::elapsed_ms(start, entropic::log::now());
344 const int32_t* tokens,
348 logger->error(
"evaluate_logprobs: model not ACTIVE (state={})",
349 state_name(
state()));
350 throw std::runtime_error(
"Model must be ACTIVE for evaluation");
354 logger->error(
"evaluate_logprobs: need >= 2 tokens, got {}",
356 throw std::runtime_error(
357 "Need at least 2 tokens for logprob evaluation");
360 std::lock_guard<std::mutex> lock(eval_mutex_);
362 logger->info(
"evaluate_logprobs: {} tokens, first=[{},{},{}...]",
364 n_tokens > 1 ? tokens[1] : 0,
365 n_tokens > 2 ? tokens[2] : 0);
366 auto start = entropic::log::now();
378 auto ms = entropic::log::elapsed_ms(start, entropic::log::now());
379 logger->info(
"evaluate_logprobs: perplexity={:.2f}, "
380 "total_lp={:.4f}, {:.2f}ms",
383 logger->info(
" logprob[{}]={:.4f}", i, result.
logprobs[i]);
402 const int32_t* tokens,
418 if (hooks_.fire_pre ==
nullptr) {
421 std::string json =
"{\"model_path\":\""
424 int rc = hooks_.fire_pre(hooks_.registry,
428 logger->info(
"[VRAM] ON_MODEL_LOAD hook cancelled");
446 return static_cast<int>(text.size()) / 4;
469 std::vector<BackendCapability> result;
471 for (
int i = 0; i < count; ++i) {
474 result.push_back(cap);
501 int seq_id, std::vector<uint8_t>& buffer)
const
504 logger->warn(
"save_state: not ACTIVE ({})", state_name(
state()));
507 auto start = entropic::log::now();
510 logger->info(
"save_state: seq={} {}B {:.2f}ms",
511 seq_id, buffer.size(), entropic::log::elapsed_ms(start, entropic::log::now()));
525 int seq_id,
const std::vector<uint8_t>& buffer)
528 logger->warn(
"restore_state: not ACTIVE ({})",
529 state_name(
state()));
532 auto start = entropic::log::now();
535 logger->info(
"restore_state: seq={} {}B {:.2f}ms",
536 seq_id, buffer.size(), entropic::log::elapsed_ms(start, entropic::log::now()));
550 logger->warn(
"clear_state: model is COLD");
555 logger->info(
"clear_state: seq={}", seq_id);
573 const std::vector<Message>& messages,
585 auto start = entropic::log::now();
587 result.generation_time_ms = entropic::log::elapsed_ms(start, entropic::log::now());
588 result.seq_id = seq_id;
605 const std::vector<Message>& messages,
607 std::function<
void(std::string_view token)> on_token,
608 std::atomic<bool>& cancel)
614 "generate_streaming_seq() requires ACTIVE state";
620 auto start = entropic::log::now();
622 seq_id, messages, params, on_token, cancel);
623 result.generation_time_ms = entropic::log::elapsed_ms(start, entropic::log::now());
624 result.seq_id = seq_id;
662 int , std::vector<uint8_t>& )
const
676 int ,
const std::vector<uint8_t>& )
703 const std::vector<Message>& messages,
722 const std::vector<Message>& messages,
724 std::function<
void(std::string_view token)> on_token,
725 std::atomic<bool>& cancel)
virtual GenerationResult do_complete(const std::string &prompt, const GenerationParams &params)=0
Subclass raw completion.
virtual GenerationResult do_generate_streaming_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Streaming generation with sequence ID.
GenerationResult generate_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params)
Generate with explicit sequence ID.
virtual LogprobResult do_evaluate_logprobs(const int32_t *tokens, int n_tokens)=0
Backend-specific logprob evaluation.
GenerationResult generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Generate via the speculative-decoding kernel (v2.1.11).
float compute_perplexity(const int32_t *tokens, int n_tokens)
Compute perplexity for a token sequence.
std::string last_error_
Last error message for diagnostics.
virtual BackendInfo do_info() const
Populate backend metadata.
virtual GenerationResult do_generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)=0
Subclass streaming generation.
bool save_state(int seq_id, std::vector< uint8_t > &buffer) const
Save model state to buffer.
bool supports(BackendCapability cap) const
Query whether this backend supports a capability.
bool restore_state(int seq_id, const std::vector< uint8_t > &buffer)
Restore model state from buffer.
bool activate()
Promote to GPU (WARM → ACTIVE).
virtual bool do_restore_state(int seq_id, const std::vector< uint8_t > &buffer)
Restore model state.
virtual int do_count_tokens(const std::string &text) const =0
Subclass token counting.
virtual bool do_supports(BackendCapability cap) const
Declare supported capabilities.
void deactivate()
Release GPU layers (ACTIVE → WARM).
virtual void do_unload()=0
Full unload.
virtual bool do_activate()=0
Promote loaded model to GPU.
BackendInfo info() const
Get backend metadata.
bool is_active() const
True when state is ACTIVE.
ModelState state() const
Current lifecycle state (lock-free read).
virtual bool do_load(const ModelConfig &config)=0
Load model into CPU RAM.
virtual GenerationResult do_generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Subclass speculative-decoding streaming generation.
virtual void do_deactivate()=0
Release GPU, keep CPU.
virtual GenerationResult do_generate(const std::vector< Message > &messages, const GenerationParams &params)=0
Subclass generation.
std::vector< BackendCapability > capabilities() const
Get all supported capabilities as a vector.
void unload()
Full unload (→ COLD).
const ModelConfig & config() const
Stored model config.
bool clear_state(int seq_id=-1)
Clear/reset model state for a sequence.
virtual GenerationResult do_generate_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params)
Generate with sequence ID.
virtual std::string do_backend_name() const =0
Return backend name identifier.
bool is_loaded() const
True when state is WARM or ACTIVE.
GenerationResult generate(const std::vector< Message > &messages, const GenerationParams &params)
Generate a complete response.
bool load(const ModelConfig &config)
Load model into CPU RAM (COLD → WARM).
virtual bool do_clear_state(int seq_id)
Clear/reset model state.
int count_tokens(const std::string &text) const
Count tokens using model's tokenizer.
virtual bool do_save_state(int seq_id, std::vector< uint8_t > &buffer) const
Save model state (KV cache or hidden state).
bool fire_model_load_hook(const ModelConfig &config)
Fire ON_MODEL_LOAD pre-hook.
GenerationResult generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Generate with per-token streaming callback.
LogprobResult evaluate_logprobs(const int32_t *tokens, int n_tokens)
Evaluate per-token log-probabilities for a token sequence.
bool load_and_activate(const ModelConfig &config)
Convenience: load() + activate().
GenerationResult complete(const std::string &prompt, const GenerationParams &params)
Raw text completion without chat template.
GenerationResult generate_streaming_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Streaming generation with explicit sequence ID.
@ ENTROPIC_ERROR_NOT_SUPPORTED
Capability not supported by this backend (v1.9.13)
@ ENTROPIC_ERROR_INVALID_STATE
Operation not valid in current state (e.g., generate before activate)
@ ENTROPIC_HOOK_ON_MODEL_UNLOAD
14: Model unloaded from backend
@ ENTROPIC_HOOK_ON_MODEL_LOAD
13: Model loaded into backend
InferenceBackend concrete base class.
spdlog initialization and logger access.
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
Activate model on GPU (WARM → ACTIVE).
BackendCapability
Capabilities that an inference backend may or may not support.
@ _COUNT
Sentinel — must be last. Used for iteration/array sizing.
@ ok
Tool dispatched, returned non-empty content.
ModelState
C++ enum class for model VRAM lifecycle states.
@ WARM
mmap'd + mlock'd in RAM
@ ACTIVE
GPU layers loaded, full speed.
@ COLD
On disk only, no RAM consumed.
Backend metadata for introspection.
std::string name
Backend identifier (e.g. "llama.cpp", "axcl")
Generation parameters for a single inference call.
Result of a single generation call.
entropic_error_t error_code
Error code (ENTROPIC_OK if no error)
std::string finish_reason
Finish reason: "stop", "length", "error".
std::string error_message
Error description (empty if no error)
Per-token log-probability evaluation result.
std::vector< float > logprobs
Log-prob for each token transition (N-1 values)
int n_logprobs
Number of logprob values (n_tokens - 1)
float total_logprob
Sum of all logprob values.
float perplexity
exp(-mean(logprobs)) — lower = less surprising
Model configuration for a single tier.
std::filesystem::path path
Resolved model file path.