Entropic 2.3.8
Local-first agentic inference engine
Loading...
Searching...
No Matches
backend.h
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
32#pragma once
33
40
41#include <atomic>
42#include <cstdint>
43#include <functional>
44#include <mutex>
45#include <string>
46#include <string_view>
47#include <vector>
48
49namespace entropic {
50
69class InferenceBackend {
70public:
71 virtual ~InferenceBackend() = default;
72
73 /* ── Lifecycle (template methods — own the state machine) ── */
74
81 bool load(const ModelConfig& config);
82
88 bool activate();
89
94 void deactivate();
95
100 void unload();
101
108 bool load_and_activate(const ModelConfig& config);
109
110 /* ── Generation (require ACTIVE state) ───────────────── */
111
120 GenerationResult generate(
121 const std::vector<Message>& messages,
122 const GenerationParams& params);
123
133 GenerationResult generate_streaming(
134 const std::vector<Message>& messages,
135 const GenerationParams& params,
136 std::function<void(std::string_view token)> on_token,
137 std::atomic<bool>& cancel);
138
155 GenerationResult generate_speculative(
156 const std::vector<Message>& messages,
157 const GenerationParams& params,
158 std::function<void(std::string_view token)> on_token,
159 std::atomic<bool>& cancel);
160
168 GenerationResult complete(
169 const std::string& prompt,
170 const GenerationParams& params);
171
172 /* ── Evaluation (require ACTIVE state) ─────────────── */
173
189 LogprobResult evaluate_logprobs(
190 const int32_t* tokens,
191 int n_tokens);
192
204 float compute_perplexity(
205 const int32_t* tokens,
206 int n_tokens);
207
208 /* ── Queries (lock-free) ─────────────────────────────── */
209
216 ModelState state() const { return state_.load(std::memory_order_acquire); }
217
224 bool is_active() const { return state() == ModelState::ACTIVE; }
225
232 bool is_loaded() const { return state() != ModelState::COLD; }
233
239 int count_tokens(const std::string& text) const;
240
248 virtual std::vector<int32_t> tokenize_text(
249 const std::string& text) const { return {}; }
250
257 int context_length() const { return config_.context_length; }
258
270 virtual void clear_prompt_cache() {}
271
278 const ModelConfig& config() const { return config_; }
279
280 /* ── Capability queries (v1.9.13) ────────────────────── */
281
293 bool supports(BackendCapability cap) const;
294
304 std::vector<BackendCapability> capabilities() const;
305
306 /* ── Backend metadata (v1.9.13) ──────────────────────── */
307
319 BackendInfo info() const;
320
321 /* ── Model state management (v1.9.13) ────────────────── */
322
334 bool save_state(int seq_id, std::vector<uint8_t>& buffer) const;
335
343 bool restore_state(int seq_id, const std::vector<uint8_t>& buffer);
344
355 bool clear_state(int seq_id = -1);
356
357 /* ── Multi-sequence generation (v1.9.13) ─────────────── */
358
370 GenerationResult generate_seq(
371 int seq_id,
372 const std::vector<Message>& messages,
373 const GenerationParams& params);
374
385 GenerationResult generate_streaming_seq(
386 int seq_id,
387 const std::vector<Message>& messages,
388 const GenerationParams& params,
389 std::function<void(std::string_view token)> on_token,
390 std::atomic<bool>& cancel);
391
392protected:
393 /* ── Subclass overrides (20%) ────────────────────────── */
394
401 virtual bool do_load(const ModelConfig& config) = 0;
402
407 virtual bool do_activate() = 0;
408
413 virtual void do_deactivate() = 0;
414
419 virtual void do_unload() = 0;
420
428 virtual GenerationResult do_generate(
429 const std::vector<Message>& messages,
430 const GenerationParams& params) = 0;
431
441 virtual GenerationResult do_generate_streaming(
442 const std::vector<Message>& messages,
443 const GenerationParams& params,
444 std::function<void(std::string_view token)> on_token,
445 std::atomic<bool>& cancel) = 0;
446
470 virtual GenerationResult do_generate_speculative(
471 const std::vector<Message>& messages,
472 const GenerationParams& params,
473 std::function<void(std::string_view token)> on_token,
474 std::atomic<bool>& cancel);
475
483 virtual GenerationResult do_complete(
484 const std::string& prompt,
485 const GenerationParams& params) = 0;
486
493 virtual int do_count_tokens(const std::string& text) const = 0;
494
510 virtual LogprobResult do_evaluate_logprobs(
511 const int32_t* tokens,
512 int n_tokens) = 0;
513
514 /* ── New overridable methods (v1.9.13) ───────────────── */
515
525 virtual bool do_supports(BackendCapability cap) const;
526
535 virtual std::string do_backend_name() const = 0;
536
545 virtual BackendInfo do_info() const;
546
554 virtual bool do_save_state(int seq_id,
555 std::vector<uint8_t>& buffer) const;
556
564 virtual bool do_restore_state(int seq_id,
565 const std::vector<uint8_t>& buffer);
566
573 virtual bool do_clear_state(int seq_id);
574
586 virtual GenerationResult do_generate_seq(
587 int seq_id,
588 const std::vector<Message>& messages,
589 const GenerationParams& params);
590
604 virtual GenerationResult do_generate_streaming_seq(
605 int seq_id,
606 const std::vector<Message>& messages,
607 const GenerationParams& params,
608 std::function<void(std::string_view token)> on_token,
609 std::atomic<bool>& cancel);
610
611 std::string last_error_;
612
619 bool fire_model_load_hook(const ModelConfig& config);
620
627 void set_hooks(const HookInterface& hooks) { hooks_ = hooks; }
628
629private:
630 std::atomic<ModelState> state_{ModelState::COLD};
631 ModelConfig config_;
632 std::mutex transition_mutex_;
633 std::mutex eval_mutex_;
634 HookInterface hooks_;
635};
636
637} // namespace entropic
Backend capability flags and metadata for architecture-agnostic queries.
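Abstract base class for inference backends: it implements the shared lifecycle and generation logic (about 80%) and leaves the backend-specific steps to pure-virtual subclass overrides.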
Definition backend.h:69
virtual GenerationResult do_complete(const std::string &prompt, const GenerationParams &params)=0
Subclass raw completion.
virtual GenerationResult do_generate_streaming_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Streaming generation with sequence ID.
Definition backend.cpp:720
GenerationResult generate_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params)
Generate with explicit sequence ID.
Definition backend.cpp:571
virtual LogprobResult do_evaluate_logprobs(const int32_t *tokens, int n_tokens)=0
Backend-specific logprob evaluation.
GenerationResult generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Generate via the speculative-decoding kernel (v2.1.11).
Definition backend.cpp:248
float compute_perplexity(const int32_t *tokens, int n_tokens)
Compute perplexity for a token sequence.
Definition backend.cpp:401
std::string last_error_
Last error message for diagnostics.
Definition backend.h:611
virtual BackendInfo do_info() const
Populate backend metadata.
Definition backend.cpp:647
virtual GenerationResult do_generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)=0
Subclass streaming generation.
bool save_state(int seq_id, std::vector< uint8_t > &buffer) const
Save model state to buffer.
Definition backend.cpp:500
void set_hooks(const HookInterface &hooks)
Set the hook dispatch interface.
Definition backend.h:627
bool supports(BackendCapability cap) const
Query whether this backend supports a capability.
Definition backend.cpp:458
bool restore_state(int seq_id, const std::vector< uint8_t > &buffer)
Restore model state from buffer.
Definition backend.cpp:524
bool activate()
Promote to GPU (WARM → ACTIVE).
Definition backend.cpp:88
virtual bool do_restore_state(int seq_id, const std::vector< uint8_t > &buffer)
Restore model state.
Definition backend.cpp:675
virtual int do_count_tokens(const std::string &text) const =0
Subclass token counting.
virtual bool do_supports(BackendCapability cap) const
Declare supported capabilities.
Definition backend.cpp:637
void deactivate()
Release GPU layers (ACTIVE → WARM).
Definition backend.cpp:117
virtual void do_unload()=0
Full unload.
virtual bool do_activate()=0
Promote loaded model to GPU.
BackendInfo info() const
Get backend metadata.
Definition backend.cpp:486
bool is_active() const
True when state is ACTIVE.
Definition backend.h:224
ModelState state() const
Current lifecycle state (lock-free read).
Definition backend.h:216
virtual bool do_load(const ModelConfig &config)=0
Load model into CPU RAM.
virtual GenerationResult do_generate_speculative(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Subclass speculative-decoding streaming generation.
Definition backend.cpp:286
virtual void clear_prompt_cache()
Invalidate any backend-owned prompt/KV caches.
Definition backend.h:270
virtual void do_deactivate()=0
Release GPU, keep CPU.
virtual std::vector< int32_t > tokenize_text(const std::string &text) const
Tokenize text to token IDs.
Definition backend.h:248
virtual GenerationResult do_generate(const std::vector< Message > &messages, const GenerationParams &params)=0
Subclass generation.
std::vector< BackendCapability > capabilities() const
Get all supported capabilities as a vector.
Definition backend.cpp:468
void unload()
Full unload (→ COLD).
Definition backend.cpp:139
const ModelConfig & config() const
Stored model config.
Definition backend.h:278
bool clear_state(int seq_id=-1)
Clear/reset model state for a sequence.
Definition backend.cpp:548
virtual GenerationResult do_generate_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params)
Generate with sequence ID.
Definition backend.cpp:701
virtual std::string do_backend_name() const =0
Return backend name identifier.
bool is_loaded() const
True when state is WARM or ACTIVE.
Definition backend.h:232
GenerationResult generate(const std::vector< Message > &messages, const GenerationParams &params)
Generate a complete response.
Definition backend.cpp:182
bool load(const ModelConfig &config)
Load model into CPU RAM (COLD → WARM).
Definition backend.cpp:54
virtual bool do_clear_state(int seq_id)
Clear/reset model state.
Definition backend.cpp:688
int count_tokens(const std::string &text) const
Count tokens using model's tokenizer.
Definition backend.cpp:442
virtual bool do_save_state(int seq_id, std::vector< uint8_t > &buffer) const
Save model state (KV cache or hidden state).
Definition backend.cpp:661
bool fire_model_load_hook(const ModelConfig &config)
Fire ON_MODEL_LOAD pre-hook.
Definition backend.cpp:417
GenerationResult generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Generate with per-token streaming callback.
Definition backend.cpp:211
LogprobResult evaluate_logprobs(const int32_t *tokens, int n_tokens)
Evaluate per-token log-probabilities for a token sequence.
Definition backend.cpp:343
int context_length() const
Model's context window size.
Definition backend.h:257
bool load_and_activate(const ModelConfig &config)
Convenience: load() + activate().
Definition backend.cpp:165
GenerationResult complete(const std::string &prompt, const GenerationParams &params)
Raw text completion without chat template.
Definition backend.cpp:308
GenerationResult generate_streaming_seq(int seq_id, const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Streaming generation with explicit sequence ID.
Definition backend.cpp:603
Configuration structs with defaults.
Generation output with metrics.
Hook dispatch interface injected into engine subsystems.
Per-token log-probability evaluation result.
Message struct for conversation history.
Activate model on GPU (WARM → ACTIVE).
BackendCapability
Capabilities that an inference backend may or may not support.
ModelState
C++ enum class for model VRAM lifecycle states.
Definition config.h:96
@ ACTIVE
GPU layers loaded, full speed.
@ COLD
On disk only, no RAM consumed.
Backend metadata for introspection.
Generation parameters for a single inference call.
Definition config.h:227
Result of a single generation call.
Per-token log-probability evaluation result.
Model configuration for a single tier.
Definition config.h:148
int context_length
Context window size (512–131072)
Definition config.h:151