Entropic 2.3.8
Local-first agentic inference engine
Loading...
Searching...
No Matches
backend.cpp
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
15
16#include <cmath>
17#include <cstdlib>
18#include <stdexcept>
19#include <string>
20
21namespace entropic {
22
23namespace {
24
25auto logger = entropic::log::get("inference.backend");
26
34const char* state_name(ModelState s) {
35 static constexpr const char* names[] = {"COLD", "WARM", "ACTIVE"};
36 int idx = static_cast<int>(s);
37 return (idx >= 0 && idx <= 2) ? names[idx] : "UNKNOWN";
38}
39
40} // anonymous namespace
41
42// ── Lifecycle ──────────────────────────────────────────────
43
55 std::lock_guard<std::mutex> lock(transition_mutex_);
56
57 if (state() != ModelState::COLD) {
58 logger->info("[VRAM] load() no-op: already {}", state_name(state()));
59 return true;
60 }
61
62 // Hook: ON_MODEL_LOAD — can cancel (v1.9.1)
63 bool cancelled = fire_model_load_hook(config);
64 if (cancelled) {
65 return false;
66 }
67
68 logger->info("[VRAM] Loading: {}", config.path.string());
69 auto start = entropic::log::now();
70
71 config_ = config;
72 bool ok = do_load(config);
73 if (!ok) {
74 logger->error("[VRAM] Load failed: {}", last_error_);
75 } else {
76 state_.store(ModelState::WARM, std::memory_order_release);
77 logger->info("[VRAM] Warm in {:.2f}ms", entropic::log::elapsed_ms(start, entropic::log::now()));
78 }
79 return ok;
80}
81
89 std::lock_guard<std::mutex> lock(transition_mutex_);
90
91 if (state() == ModelState::ACTIVE) {
92 logger->info("[VRAM] activate() no-op: already ACTIVE");
93 return true;
94 }
95 if (state() != ModelState::WARM) {
96 logger->error("[VRAM] activate() failed: not WARM ({})", state_name(state()));
97 return false;
98 }
99
100 logger->info("[VRAM] Activating");
101 auto start = entropic::log::now();
102 bool ok = do_activate();
103 if (!ok) {
104 logger->error("[VRAM] Activate failed: {}", last_error_);
105 } else {
106 state_.store(ModelState::ACTIVE, std::memory_order_release);
107 logger->info("[VRAM] Active in {:.2f}ms", entropic::log::elapsed_ms(start, entropic::log::now()));
108 }
109 return ok;
110}
111
118 std::lock_guard<std::mutex> lock(transition_mutex_);
119
120 if (state() != ModelState::ACTIVE) {
121 logger->info("[VRAM] deactivate() no-op: {}", state_name(state()));
122 return;
123 }
124
125 logger->info("[VRAM] Deactivating");
126 auto start = entropic::log::now();
127
129 state_.store(ModelState::WARM, std::memory_order_release);
130
131 logger->info("[VRAM] Deactivated in {:.2f}ms", entropic::log::elapsed_ms(start, entropic::log::now()));
132}
133
140 std::lock_guard<std::mutex> lock(transition_mutex_);
141
142 // Hook: ON_MODEL_UNLOAD — informational (v1.9.1)
143 if (hooks_.fire_info != nullptr) {
144 std::string json = "{\"state\":\""
145 + std::string(state_name(state())) + "\"}";
146 hooks_.fire_info(hooks_.registry,
147 ENTROPIC_HOOK_ON_MODEL_UNLOAD, json.c_str());
148 }
149
150 logger->info("[VRAM] Unloading from {}", state_name(state()));
151
152 do_unload();
153 state_.store(ModelState::COLD, std::memory_order_release);
154
155 logger->info("[VRAM] Unloaded");
156}
157
166 if (!load(config)) {
167 return false;
168 }
169 return activate();
170}
171
172// ── Generation ─────────────────────────────────────────────
173
183 const std::vector<Message>& messages,
184 const GenerationParams& params)
185{
186 if (!is_active()) {
189 err.error_message = "generate() requires ACTIVE state";
190 err.finish_reason = "error";
191 logger->error("{}", err.error_message);
192 return err;
193 }
194
195 auto start = entropic::log::now();
196 auto result = do_generate(messages, params);
197 result.generation_time_ms = entropic::log::elapsed_ms(start, entropic::log::now());
198 return result;
199}
200
212 const std::vector<Message>& messages,
213 const GenerationParams& params,
214 std::function<void(std::string_view token)> on_token,
215 std::atomic<bool>& cancel)
216{
217 if (!is_active()) {
220 err.error_message = "generate_streaming() requires ACTIVE state";
221 err.finish_reason = "error";
222 logger->error("{}", err.error_message);
223 return err;
224 }
225
226 auto start = entropic::log::now();
227 auto result = do_generate_streaming(messages, params, on_token, cancel);
228 result.generation_time_ms = entropic::log::elapsed_ms(start, entropic::log::now());
229 return result;
230}
231
249 const std::vector<Message>& messages,
250 const GenerationParams& params,
251 std::function<void(std::string_view token)> on_token,
252 std::atomic<bool>& cancel)
253{
254 if (!is_active()) {
257 err.error_message =
258 "generate_speculative() requires ACTIVE state";
259 err.finish_reason = "error";
260 logger->error("{}", err.error_message);
261 return err;
262 }
263 auto start = entropic::log::now();
264 auto result = do_generate_speculative(
265 messages, params, std::move(on_token), cancel);
266 result.generation_time_ms =
267 entropic::log::elapsed_ms(start, entropic::log::now());
268 return result;
269}
270
287 const std::vector<Message>& /*messages*/,
288 const GenerationParams& /*params*/,
289 std::function<void(std::string_view)> /*on_token*/,
290 std::atomic<bool>& /*cancel*/)
291{
292 GenerationResult result;
294 result.error_message =
295 "speculative decoding not implemented for this backend";
296 result.finish_reason = "error";
297 return result;
298}
299
309 const std::string& prompt,
310 const GenerationParams& params)
311{
312 if (!is_active()) {
315 err.error_message = "complete() requires ACTIVE state";
316 err.finish_reason = "error";
317 logger->error("{}", err.error_message);
318 return err;
319 }
320
321 auto start = entropic::log::now();
322 auto result = do_complete(prompt, params);
323 result.generation_time_ms = entropic::log::elapsed_ms(start, entropic::log::now());
324 return result;
325}
326
327// ── Evaluation (v1.9.10) ───────────────────────────────────
328
344 const int32_t* tokens,
345 int n_tokens)
346{
347 if (!is_active()) {
348 logger->error("evaluate_logprobs: model not ACTIVE (state={})",
349 state_name(state()));
350 throw std::runtime_error("Model must be ACTIVE for evaluation");
351 }
352
353 if (n_tokens < 2) {
354 logger->error("evaluate_logprobs: need >= 2 tokens, got {}",
355 n_tokens);
356 throw std::runtime_error(
357 "Need at least 2 tokens for logprob evaluation");
358 }
359
360 std::lock_guard<std::mutex> lock(eval_mutex_);
361
362 logger->info("evaluate_logprobs: {} tokens, first=[{},{},{}...]",
363 n_tokens, tokens[0],
364 n_tokens > 1 ? tokens[1] : 0,
365 n_tokens > 2 ? tokens[2] : 0);
366 auto start = entropic::log::now();
367
368 LogprobResult result = do_evaluate_logprobs(tokens, n_tokens);
369
370 result.total_logprob = 0.0f;
371 for (float lp : result.logprobs) {
372 result.total_logprob += lp;
373 }
374 float mean_lp = result.total_logprob /
375 static_cast<float>(result.n_logprobs);
376 result.perplexity = std::exp(-mean_lp);
377
378 auto ms = entropic::log::elapsed_ms(start, entropic::log::now());
379 logger->info("evaluate_logprobs: perplexity={:.2f}, "
380 "total_lp={:.4f}, {:.2f}ms",
381 result.perplexity, result.total_logprob, ms);
382 for (int i = 0; i < result.n_logprobs; ++i) {
383 logger->info(" logprob[{}]={:.4f}", i, result.logprobs[i]);
384 }
385
386 return result;
387}
388
402 const int32_t* tokens,
403 int n_tokens)
404{
405 return evaluate_logprobs(tokens, n_tokens).perplexity;
406}
407
408// ── Hook helpers (v1.9.1) ──────────────────────────────────
409
418 if (hooks_.fire_pre == nullptr) {
419 return false;
420 }
421 std::string json = "{\"model_path\":\""
422 + config.path.string() + "\"}";
423 char* mod = nullptr;
424 int rc = hooks_.fire_pre(hooks_.registry,
425 ENTROPIC_HOOK_ON_MODEL_LOAD, json.c_str(), &mod);
426 free(mod);
427 if (rc != 0) {
428 logger->info("[VRAM] ON_MODEL_LOAD hook cancelled");
429 }
430 return rc != 0;
431}
432
433// ── Queries ────────────────────────────────────────────────
434
442int InferenceBackend::count_tokens(const std::string& text) const {
443 if (is_loaded()) {
444 return do_count_tokens(text);
445 }
446 return static_cast<int>(text.size()) / 4;
447}
448
449// ── Capability queries (v1.9.13) ───────────────────────────
450
459 return do_supports(cap);
460}
461
468std::vector<BackendCapability> InferenceBackend::capabilities() const {
469 std::vector<BackendCapability> result;
470 int count = static_cast<int>(BackendCapability::_COUNT);
471 for (int i = 0; i < count; ++i) {
472 auto cap = static_cast<BackendCapability>(i);
473 if (supports(cap)) {
474 result.push_back(cap);
475 }
476 }
477 return result;
478}
479
487 return do_info();
488}
489
490// ── Model state management (v1.9.13) ──────────────────────
491
501 int seq_id, std::vector<uint8_t>& buffer) const
502{
503 if (!is_active()) {
504 logger->warn("save_state: not ACTIVE ({})", state_name(state()));
505 return false;
506 }
507 auto start = entropic::log::now();
508 bool ok = do_save_state(seq_id, buffer);
509 if (ok) {
510 logger->info("save_state: seq={} {}B {:.2f}ms",
511 seq_id, buffer.size(), entropic::log::elapsed_ms(start, entropic::log::now()));
512 }
513 return ok;
514}
515
525 int seq_id, const std::vector<uint8_t>& buffer)
526{
527 if (!is_active()) {
528 logger->warn("restore_state: not ACTIVE ({})",
529 state_name(state()));
530 return false;
531 }
532 auto start = entropic::log::now();
533 bool ok = do_restore_state(seq_id, buffer);
534 if (ok) {
535 logger->info("restore_state: seq={} {}B {:.2f}ms",
536 seq_id, buffer.size(), entropic::log::elapsed_ms(start, entropic::log::now()));
537 }
538 return ok;
539}
540
549 if (state() == ModelState::COLD) {
550 logger->warn("clear_state: model is COLD");
551 return false;
552 }
553 bool ok = do_clear_state(seq_id);
554 if (ok) {
555 logger->info("clear_state: seq={}", seq_id);
556 }
557 return ok;
558}
559
560// ── Multi-sequence generation (v1.9.13) ────────────────────
561
572 int seq_id,
573 const std::vector<Message>& messages,
574 const GenerationParams& params)
575{
576 if (!is_active()) {
579 err.error_message = "generate_seq() requires ACTIVE state";
580 err.finish_reason = "error";
581 logger->error("{}", err.error_message);
582 return err;
583 }
584
585 auto start = entropic::log::now();
586 auto result = do_generate_seq(seq_id, messages, params);
587 result.generation_time_ms = entropic::log::elapsed_ms(start, entropic::log::now());
588 result.seq_id = seq_id;
589 return result;
590}
591
604 int seq_id,
605 const std::vector<Message>& messages,
606 const GenerationParams& params,
607 std::function<void(std::string_view token)> on_token,
608 std::atomic<bool>& cancel)
609{
610 if (!is_active()) {
613 err.error_message =
614 "generate_streaming_seq() requires ACTIVE state";
615 err.finish_reason = "error";
616 logger->error("{}", err.error_message);
617 return err;
618 }
619
620 auto start = entropic::log::now();
621 auto result = do_generate_streaming_seq(
622 seq_id, messages, params, on_token, cancel);
623 result.generation_time_ms = entropic::log::elapsed_ms(start, entropic::log::now());
624 result.seq_id = seq_id;
625 return result;
626}
627
628// ── Default virtual implementations (v1.9.13) ─────────────
629
638 return false;
639}
640
648 BackendInfo bi;
649 bi.name = do_backend_name();
650 return bi;
651}
652
662 int /*seq_id*/, std::vector<uint8_t>& /*buffer*/) const
663{
664 return false;
665}
666
676 int /*seq_id*/, const std::vector<uint8_t>& /*buffer*/)
677{
678 return false;
679}
680
689 return false;
690}
691
702 int /*seq_id*/,
703 const std::vector<Message>& messages,
704 const GenerationParams& params)
705{
706 return do_generate(messages, params);
707}
708
721 int /*seq_id*/,
722 const std::vector<Message>& messages,
723 const GenerationParams& params,
724 std::function<void(std::string_view token)> on_token,
725 std::atomic<bool>& cancel)
726{
727 return do_generate_streaming(messages, params, on_token, cancel);
728}
729
730} // namespace entropic
virtual GenerationResult do_complete(const std::string &prompt, const GenerationParams &params)=0
Subclass raw completion.
virtual GenerationResult do_generate_streaming_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Streaming generation with sequence ID.
Definition backend.cpp:720
GenerationResult generate_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params)
Generate with explicit sequence ID.
Definition backend.cpp:571
virtual LogprobResult do_evaluate_logprobs(const int32_t *tokens, int n_tokens)=0
Backend-specific logprob evaluation.
GenerationResult generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Generate via the speculative-decoding kernel (v2.1.11).
Definition backend.cpp:248
float compute_perplexity(const int32_t *tokens, int n_tokens)
Compute perplexity for a token sequence.
Definition backend.cpp:401
std::string last_error_
Last error message for diagnostics.
Definition backend.h:611
virtual BackendInfo do_info() const
Populate backend metadata.
Definition backend.cpp:647
virtual GenerationResult do_generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)=0
Subclass streaming generation.
bool save_state(int seq_id, std::vector< uint8_t > &buffer) const
Save model state to buffer.
Definition backend.cpp:500
bool supports(BackendCapability cap) const
Query whether this backend supports a capability.
Definition backend.cpp:458
bool restore_state(int seq_id, const std::vector< uint8_t > &buffer)
Restore model state from buffer.
Definition backend.cpp:524
bool activate()
Promote to GPU (WARM → ACTIVE).
Definition backend.cpp:88
virtual bool do_restore_state(int seq_id, const std::vector< uint8_t > &buffer)
Restore model state.
Definition backend.cpp:675
virtual int do_count_tokens(const std::string &text) const =0
Subclass token counting.
virtual bool do_supports(BackendCapability cap) const
Declare supported capabilities.
Definition backend.cpp:637
void deactivate()
Release GPU layers (ACTIVE → WARM).
Definition backend.cpp:117
virtual void do_unload()=0
Full unload.
virtual bool do_activate()=0
Promote loaded model to GPU.
BackendInfo info() const
Get backend metadata.
Definition backend.cpp:486
bool is_active() const
True when state is ACTIVE.
Definition backend.h:224
ModelState state() const
Current lifecycle state (lock-free read).
Definition backend.h:216
virtual bool do_load(const ModelConfig &config)=0
Load model into CPU RAM.
virtual GenerationResult do_generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Subclass speculative-decoding streaming generation.
Definition backend.cpp:286
virtual void do_deactivate()=0
Release GPU, keep CPU.
virtual GenerationResult do_generate(const std::vector< Message > &messages, const GenerationParams &params)=0
Subclass generation.
std::vector< BackendCapability > capabilities() const
Get all supported capabilities as a vector.
Definition backend.cpp:468
void unload()
Full unload (→ COLD).
Definition backend.cpp:139
const ModelConfig & config() const
Stored model config.
Definition backend.h:278
bool clear_state(int seq_id=-1)
Clear/reset model state for a sequence.
Definition backend.cpp:548
virtual GenerationResult do_generate_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params)
Generate with sequence ID.
Definition backend.cpp:701
virtual std::string do_backend_name() const =0
Return backend name identifier.
bool is_loaded() const
True when state is WARM or ACTIVE.
Definition backend.h:232
GenerationResult generate(const std::vector< Message > &messages, const GenerationParams &params)
Generate a complete response.
Definition backend.cpp:182
bool load(const ModelConfig &config)
Load model into CPU RAM (COLD → WARM).
Definition backend.cpp:54
virtual bool do_clear_state(int seq_id)
Clear/reset model state.
Definition backend.cpp:688
int count_tokens(const std::string &text) const
Count tokens using model's tokenizer.
Definition backend.cpp:442
virtual bool do_save_state(int seq_id, std::vector< uint8_t > &buffer) const
Save model state (KV cache or hidden state).
Definition backend.cpp:661
bool fire_model_load_hook(const ModelConfig &config)
Fire ON_MODEL_LOAD pre-hook.
Definition backend.cpp:417
GenerationResult generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Generate with per-token streaming callback.
Definition backend.cpp:211
LogprobResult evaluate_logprobs(const int32_t *tokens, int n_tokens)
Evaluate per-token log-probabilities for a token sequence.
Definition backend.cpp:343
bool load_and_activate(const ModelConfig &config)
Convenience: load() + activate().
Definition backend.cpp:165
GenerationResult complete(const std::string &prompt, const GenerationParams &params)
Raw text completion without chat template.
Definition backend.cpp:308
GenerationResult generate_streaming_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Streaming generation with explicit sequence ID.
Definition backend.cpp:603
@ ENTROPIC_ERROR_NOT_SUPPORTED
Capability not supported by this backend (v1.9.13)
Definition error.h:84
@ ENTROPIC_ERROR_INVALID_STATE
Operation not valid in current state (e.g., generate before activate)
Definition error.h:39
@ ENTROPIC_HOOK_ON_MODEL_UNLOAD
14: Model unloaded from backend
Definition hooks.h:50
@ ENTROPIC_HOOK_ON_MODEL_LOAD
13: Model loaded into backend
Definition hooks.h:49
InferenceBackend concrete base class.
spdlog initialization and logger access.
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
Definition logging.cpp:211
Activate model on GPU (WARM → ACTIVE).
BackendCapability
Capabilities that an inference backend may or may not support.
@ _COUNT
Sentinel — must be last. Used for iteration/array sizing.
@ ok
Tool dispatched, returned non-empty content.
ModelState
C++ enum class for model VRAM lifecycle states.
Definition config.h:96
@ WARM
mmap'd + mlock'd in RAM
@ ACTIVE
GPU layers loaded, full speed.
@ COLD
On disk only, no RAM consumed.
Backend metadata for introspection.
std::string name
Backend identifier (e.g. "llama.cpp", "axcl")
Generation parameters for a single inference call.
Definition config.h:227
Result of a single generation call.
entropic_error_t error_code
Error code (ENTROPIC_OK if no error)
std::string finish_reason
Finish reason: "stop", "length", "error".
std::string error_message
Error description (empty if no error)
Per-token log-probability evaluation result.
std::vector< float > logprobs
Log-prob for each token transition (N-1 values)
int n_logprobs
Number of logprob values (n_tokens - 1)
float total_logprob
Sum of all logprob values.
float perplexity
exp(-mean(logprobs)) — lower = less surprising
Model configuration for a single tier.
Definition config.h:148
std::filesystem::path path
Resolved model file path.
Definition config.h:149