Entropic 2.3.8
Local-first agentic inference engine
Loading...
Searching...
No Matches
orchestrator.h
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
23#pragma once
24
34
35#include <chrono>
36#include <functional>
37#include <memory>
38#include <mutex>
39#include <string>
40#include <unordered_map>
41#include <unordered_set>
42#include <vector>
43
44struct llama_context; // Forward declaration for adapter management
45struct llama_model; // Forward declaration for speculative compat (v2.1.11)
46
47namespace entropic {
48
54 std::string tier_name;
55 std::string previous_tier;
56 std::string model_raw;
57 std::string swap_action = "none";
58 double routing_ms = 0.0;
59 std::string adapter_name;
60 double adapter_swap_ms = 0.0;
61};
62
72public:
79 bool initialize(const ParsedConfig& config);
80
85 void shutdown();
86
110
111 /* ── Generation ──────────────────────────────────────── */
112
122 const std::vector<Message>& messages,
123 const GenerationParams& params,
124 const std::string& tier_name = "");
125
131 const std::vector<Message>& messages,
132 const GenerationParams& params,
133 std::function<void(std::string_view)> on_token,
134 std::atomic<bool>& cancel,
135 const std::string& tier_name = "");
136
137 /* ── Routing ─────────────────────────────────────────── */
138
145 std::string route(const std::vector<Message>& messages);
146
147 /* ── Queries ─────────────────────────────────────────── */
148
154
159 std::string last_used_tier() const;
160
165 std::vector<std::string> loaded_models() const;
166
171 std::vector<std::string> available_models() const;
172
177 bool can_handoff(const std::string& from, const std::string& to) const;
178
183 ChatAdapter* get_adapter(const std::string& tier_name) const;
184
191 InferenceBackend* get_backend(const std::string& tier_name) const;
192
199 AdapterManager& adapter_manager() { return lora_manager_; }
200
207 GrammarRegistry& grammar_registry() { return grammar_registry_; }
208
215 ProfileRegistry& profile_registry() { return profile_registry_; }
216
223 ThroughputTracker& throughput_tracker() { return throughput_tracker_; }
224
231 size_t load_grammars_from(const std::filesystem::path& grammar_dir);
232
244
258 bool has_vision_capable_tier() const;
259
266 bool compatible = false;
267 std::string diagnostic;
268 };
269
287
301 void set_speculative_enabled(bool enabled) {
302 config_.inference.speculative.enabled = enabled;
303 }
304
305 /* ── VRAM-aware tier residency (v2.2.4, gh#57) ────────── */
306
313 enum class ResidencyEvent : int {
314 Loaded = 0,
315 Evicted = 1,
316 ActivationSwap = 2,
317 };
318
333 using ResidencyObserverFn = std::function<void(
334 ResidencyEvent event,
335 const std::string& tier_name,
336 const std::string& model_path,
337 size_t footprint)>;
338
349
361 std::string residency_snapshot_json() const;
362
374 size_t vram_budget_bytes() const { return vram_budget_bytes_; }
375
387 size_t tier_footprint_bytes(const std::string& tier_name) const;
388
400 entropic_error_t last_residency_error() const { return last_residency_error_; }
401
407 void clear_last_residency_error() { last_residency_error_ = ENTROPIC_OK; }
408
424 std::string select_vision_tier() const;
425
426private:
427 /* ── Model pool (one backend per unique path) ────────── */
428 std::unordered_map<std::string, std::shared_ptr<InferenceBackend>> model_pool_;
429
430 /* ── Tier → backend mapping (many-to-one) ────────────── */
431 std::unordered_map<std::string, std::shared_ptr<InferenceBackend>> tiers_;
432
433 /* ── Per-tier adapters (one-to-one, identity-specific) ── */
434 std::unordered_map<std::string, std::unique_ptr<ChatAdapter>> adapters_;
435
436 /* ── Secondary models (router, draft, future thinking) ── */
447 SecondaryModelLoader secondary_loader_;
448
449 /* ── Routing state ───────────────────────────────────── */
450 std::unordered_map<std::string, std::string> tier_map_;
451 std::unordered_map<std::string, std::unordered_set<std::string>> handoff_rules_;
452 std::string default_tier_;
453 std::string loaded_main_tier_;
454 RoutingResult last_routing_result_;
455 std::vector<std::string> tier_history_;
456
457 mutable std::mutex swap_mutex_;
458
459 ParsedConfig config_;
460
461 /* ── Residency tracking (v2.2.4, gh#57) ──────────────── */
462
472 mutable std::unordered_map<std::string, size_t> tier_footprint_bytes_;
473
480 std::unordered_map<std::string, long long> tier_last_activation_ms_;
481
486 std::chrono::steady_clock::time_point start_time_{std::chrono::steady_clock::now()};
487
497 size_t vram_budget_bytes_{0};
498
503 ResidencyObserverFn residency_observer_;
504
509 entropic_error_t last_residency_error_{ENTROPIC_OK};
510
524 size_t estimate_footprint_bytes(const std::string& tier_name) const;
525
536 static size_t resolve_vram_budget_bytes();
537
548 void fire_residency_observer(
549 ResidencyEvent event,
550 const std::string& tier_name,
551 const std::string& model_path,
552 size_t footprint);
553
554 /* ── LoRA adapter management (v1.9.2) ────────────────── */
555 AdapterManager lora_manager_;
556
557 /* ── Grammar registry (v1.9.3) ────────────────────────── */
558 GrammarRegistry grammar_registry_;
559
560 /* ── Profile registry (v2.0.0) ───────────────────────── */
561 ProfileRegistry profile_registry_;
562
563 /* ── Throughput tracker (v2.0.0) ─────────────────────── */
564 ThroughputTracker throughput_tracker_;
565
566 /* ── Internal ────────────────────────────────────────── */
567
572 InferenceBackend* get_model(const std::string& tier_name);
573
581 void record_activation_reuse(const std::string& tier_name);
582
590 bool residency_admits(const std::string& tier_name);
591
598 InferenceBackend* activate_and_track(
599 const std::string& tier_name,
600 const std::shared_ptr<InferenceBackend>& backend);
601
609 GenerationResult build_no_model_error(const std::string& tier_name);
610
615 void deactivate_current_if_needed(InferenceBackend* incoming);
616
629 void ensure_tier_lora(const std::string& tier_name,
630 InferenceBackend* result);
631
642 void unload_or_warm_current(InferenceBackend* current);
643
648 std::pair<std::string, std::string> classify_task(
649 const std::vector<Message>& messages);
650
657 bool deactivate_if_active(llama_context* ctx);
658
670 double ensure_adapter_for_tier(
671 const std::string& tier_name, llama_context* ctx);
672
677 void preload_adapters();
678
686 bool create_tier_backends(const ParsedConfig& config);
687
694 void build_routing_tables(const ParsedConfig& config);
695
703 bool activate_default_tier(const ParsedConfig& config);
704
711 void activate_router(const ParsedConfig& config);
712
728 void activate_draft(const ParsedConfig& config);
729
734 void load_bundled_grammars();
735
742 void resolve_grammar_key(GenerationParams& params,
743 const std::string& tier_name);
744
755 std::string resolve_speculative_pair(
756 llama_model*& target_out, llama_model*& draft_out) const;
757
765 bool try_speculative_route(
766 InferenceBackend* model,
767 const std::vector<Message>& messages,
768 const GenerationParams& params,
769 GenerationResult& result);
770
779 GenerationResult run_generate_dispatch(
780 InferenceBackend* model,
781 const std::vector<Message>& messages,
782 const GenerationParams& params);
783
791 bool try_speculative_route_streaming(
792 InferenceBackend* model,
793 const std::vector<Message>& messages,
794 const GenerationParams& params,
795 std::function<void(std::string_view)> on_token,
796 std::atomic<bool>& cancel,
797 GenerationResult& result);
798};
799
800} // namespace entropic
ChatAdapter concrete base class.
AdapterManager — LoRA adapter lifecycle and hot-swap.
LoRA adapter lifecycle manager.
Concrete base class for chat format adapters (80% logic).
Centralized grammar registry for named GBNF grammars.
Concrete base class for inference backends (80% logic).
Definition backend.h:69
Multi-model lifecycle and routing orchestrator.
size_t vram_budget_bytes() const
Engine-tracked VRAM budget in bytes (0 = unknown).
SpeculativeCompatInfo check_speculative_compat() const
Check whether the currently-configured target/draft pair is compatible for speculative decoding.
std::vector< std::string > available_models() const
All configured tier names.
size_t load_grammars_from(const std::filesystem::path &grammar_dir)
Load grammars from an explicit directory path.
GrammarRegistry & grammar_registry()
Access the grammar registry.
GenerationResult generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > &cancel, const std::string &tier_name="")
Streaming generation.
std::vector< std::string > loaded_models() const
Currently loaded model tier names.
bool initialize(const ParsedConfig &config)
Initialize from parsed config.
bool has_vision_capable_tier() const
Return true if any configured tier declares the "vision" capability (gh#41, v2.1.8).
size_t tier_footprint_bytes(const std::string &tier_name) const
Estimated VRAM footprint for a given tier in bytes.
AdapterManager & adapter_manager()
Access the LoRA adapter manager.
void shutdown()
Shutdown — unload all models.
RoutingResult last_routing_result() const
Last routing result.
std::function< void(ResidencyEvent event, const std::string &tier_name, const std::string &model_path, size_t footprint)> ResidencyObserverFn
Residency observer callback type (internal C++ form).
GenerationResult generate(const std::vector< Message > &messages, const GenerationParams &params, const std::string &tier_name="")
Generate using routed or explicit tier.
void clear_last_residency_error()
Reset the stored residency error code to ENTROPIC_OK.
void clear_all_prompt_caches()
Invalidate prompt/KV caches across every pooled backend.
entropic_error_t last_residency_error() const
Last residency-related error code, or ENTROPIC_OK if none.
std::string route(const std::vector< Message > &messages)
Route to tier using router model.
ChatAdapter * get_adapter(const std::string &tier_name) const
Get adapter for a tier.
void set_residency_observer(ResidencyObserverFn cb)
Register a residency observer.
std::string last_used_tier() const
Last used tier name.
~ModelOrchestrator()
Destructor — invokes shutdown() and AdapterManager::unload_all().
std::string select_vision_tier() const
Pick the canonical vision-capable tier name (gh#41).
ProfileRegistry & profile_registry()
Access the GPU resource profile registry.
ThroughputTracker & throughput_tracker()
Access the throughput tracker.
void set_speculative_enabled(bool enabled)
Runtime toggle for the speculative-decoding path.
bool can_handoff(const std::string &from, const std::string &to) const
Check if handoff is permitted.
std::string residency_snapshot_json() const
Serialize the current residency set as a JSON string.
ResidencyEvent
Residency observer event codes — mirror the C ABI enum entropic_residency_event_t exactly (Loaded = 0, Evicted = 1, ActivationSwap = 2).
InferenceBackend * get_backend(const std::string &tier_name) const
Get the inference backend for a tier (for evaluation APIs).
Centralized registry for named GPU resource profiles.
Role-keyed lifecycle manager for non-primary models.
EWMA-based throughput tracker for generation budgeting.
Configuration structs with defaults.
Error types for cross-.so error reporting.
entropic_error_t
Error codes returned by all C API functions.
Definition error.h:35
@ ENTROPIC_OK
Success.
Definition error.h:36
GrammarRegistry — named grammar management and validation.
InferenceBackend concrete base class.
Activate model on GPU (WARM → ACTIVE).
ProfileRegistry — named GPU resource profile management.
Unified lifecycle for non-primary inference backends.
Generation parameters for a single inference call.
Definition config.h:227
Result of a single generation call.
SpeculativeConfig speculative
Speculative decoding (gh#36)
Definition config.h:703
Result of a speculative-decoding compatibility check.
bool compatible
true when speculative may proceed
std::string diagnostic
Reason on failure (empty on ok)
Full parsed configuration.
Definition config.h:714
InferenceConfig inference
Inference-side knobs (currently speculative decoding only).
Definition config.h:760
Result metadata from a routing decision.
std::string adapter_name
Active adapter (empty = base model) (v1.9.2)
double routing_ms
Total routing time.
std::string model_raw
Raw model output (e.g. "2")
std::string tier_name
Selected tier.
std::string swap_action
One of "none", "reused", or "loaded" (defaults to "none").
std::string previous_tier
Previous tier (empty if first)
double adapter_swap_ms
Adapter swap latency (v1.9.2)
bool enabled
Master switch (off by default)
Definition config.h:673
ThroughputTracker — real-time throughput measurement and prediction.