40#include <unordered_map>
41#include <unordered_set>
122 const std::vector<Message>& messages,
124 const std::string& tier_name =
"");
131 const std::vector<Message>& messages,
133 std::function<
void(std::string_view)> on_token,
134 std::atomic<bool>& cancel,
135 const std::string& tier_name =
"");
145 std::string
route(
const std::vector<Message>& messages);
177 bool can_handoff(
const std::string& from,
const std::string& to)
const;
335 const std::string& tier_name,
336 const std::string& model_path,
428 std::unordered_map<std::string, std::shared_ptr<InferenceBackend>> model_pool_;
431 std::unordered_map<std::string, std::shared_ptr<InferenceBackend>> tiers_;
434 std::unordered_map<std::string, std::unique_ptr<ChatAdapter>> adapters_;
450 std::unordered_map<std::string, std::string> tier_map_;
451 std::unordered_map<std::string, std::unordered_set<std::string>> handoff_rules_;
452 std::string default_tier_;
453 std::string loaded_main_tier_;
455 std::vector<std::string> tier_history_;
457 mutable std::mutex swap_mutex_;
472 mutable std::unordered_map<std::string, size_t> tier_footprint_bytes_;
480 std::unordered_map<std::string, long long> tier_last_activation_ms_;
486 std::chrono::steady_clock::time_point start_time_{std::chrono::steady_clock::now()};
497 size_t vram_budget_bytes_{0};
524 size_t estimate_footprint_bytes(
const std::string& tier_name)
const;
536 static size_t resolve_vram_budget_bytes();
548 void fire_residency_observer(
550 const std::string& tier_name,
551 const std::string& model_path,
555 AdapterManager lora_manager_;
558 GrammarRegistry grammar_registry_;
561 ProfileRegistry profile_registry_;
564 ThroughputTracker throughput_tracker_;
572 InferenceBackend* get_model(
const std::string& tier_name);
581 void record_activation_reuse(
const std::string& tier_name);
590 bool residency_admits(
const std::string& tier_name);
598 InferenceBackend* activate_and_track(
599 const std::string& tier_name,
600 const std::shared_ptr<InferenceBackend>& backend);
609 GenerationResult build_no_model_error(
const std::string& tier_name);
615 void deactivate_current_if_needed(InferenceBackend* incoming);
629 void ensure_tier_lora(
const std::string& tier_name,
630 InferenceBackend* result);
642 void unload_or_warm_current(InferenceBackend* current);
648 std::pair<std::string, std::string> classify_task(
649 const std::vector<Message>& messages);
657 bool deactivate_if_active(llama_context* ctx);
670 double ensure_adapter_for_tier(
671 const std::string& tier_name, llama_context* ctx);
677 void preload_adapters();
686 bool create_tier_backends(
const ParsedConfig& config);
694 void build_routing_tables(
const ParsedConfig& config);
703 bool activate_default_tier(
const ParsedConfig& config);
711 void activate_router(
const ParsedConfig& config);
728 void activate_draft(
const ParsedConfig& config);
734 void load_bundled_grammars();
742 void resolve_grammar_key(GenerationParams& params,
743 const std::string& tier_name);
755 std::string resolve_speculative_pair(
756 llama_model*& target_out, llama_model*& draft_out)
const;
765 bool try_speculative_route(
766 InferenceBackend* model,
767 const std::vector<Message>& messages,
768 const GenerationParams& params,
769 GenerationResult& result);
779 GenerationResult run_generate_dispatch(
780 InferenceBackend* model,
781 const std::vector<Message>& messages,
782 const GenerationParams& params);
791 bool try_speculative_route_streaming(
792 InferenceBackend* model,
793 const std::vector<Message>& messages,
794 const GenerationParams& params,
795 std::function<
void(std::string_view)> on_token,
796 std::atomic<bool>& cancel,
797 GenerationResult& result);
ChatAdapter concrete base class.
AdapterManager — LoRA adapter lifecycle and hot-swap.
LoRA adapter lifecycle manager.
Concrete base class for chat format adapters (80% logic).
Centralized grammar registry for named GBNF grammars.
Concrete base class for inference backends (80% logic).
Multi-model lifecycle and routing orchestrator.
size_t vram_budget_bytes() const
Engine-tracked VRAM budget in bytes (0 = unknown).
SpeculativeCompatInfo check_speculative_compat() const
Check whether the currently-configured target/draft pair is compatible for speculative decoding.
std::vector< std::string > available_models() const
All configured tier names.
size_t load_grammars_from(const std::filesystem::path &grammar_dir)
Load grammars from an explicit directory path.
GrammarRegistry & grammar_registry()
Access the grammar registry.
GenerationResult generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > &cancel, const std::string &tier_name="")
Streaming generation.
std::vector< std::string > loaded_models() const
Currently loaded model tier names.
bool initialize(const ParsedConfig &config)
Initialize from parsed config.
bool has_vision_capable_tier() const
Return true if any configured tier declares the "vision" capability (gh#41, v2.1.8).
size_t tier_footprint_bytes(const std::string &tier_name) const
Estimated VRAM footprint for a given tier in bytes.
AdapterManager & adapter_manager()
Access the LoRA adapter manager.
void shutdown()
Shutdown — unload all models.
RoutingResult last_routing_result() const
Last routing result.
std::function< void(ResidencyEvent event, const std::string &tier_name, const std::string &model_path, size_t footprint)> ResidencyObserverFn
Residency observer callback type (internal C++ form).
GenerationResult generate(const std::vector< Message > &messages, const GenerationParams &params, const std::string &tier_name="")
Generate using routed or explicit tier.
void clear_last_residency_error()
Clear last_residency_error().
void clear_all_prompt_caches()
Invalidate prompt/KV caches across every pooled backend.
entropic_error_t last_residency_error() const
Last residency-related error code, or ENTROPIC_OK if none.
std::string route(const std::vector< Message > &messages)
Route to tier using router model.
ChatAdapter * get_adapter(const std::string &tier_name) const
Get adapter for a tier.
void set_residency_observer(ResidencyObserverFn cb)
Register a residency observer.
std::string last_used_tier() const
Last used tier name.
~ModelOrchestrator()
Destructor — invokes shutdown() and AdapterManager::unload_all().
std::string select_vision_tier() const
Pick the canonical vision-capable tier name (gh#41).
ProfileRegistry & profile_registry()
Access the GPU resource profile registry.
ThroughputTracker & throughput_tracker()
Access the throughput tracker.
void set_speculative_enabled(bool enabled)
Runtime toggle for the speculative-decoding path.
bool can_handoff(const std::string &from, const std::string &to) const
Check if handoff is permitted.
std::string residency_snapshot_json() const
Serialize the current residency set as a JSON string.
ResidencyEvent
Residency observer event codes — mirror the C ABI enum entropic_residency_event_t exactly (LOADED=0,...
InferenceBackend * get_backend(const std::string &tier_name) const
Get the inference backend for a tier (for evaluation APIs).
Centralized registry for named GPU resource profiles.
Role-keyed lifecycle manager for non-primary models.
EWMA-based throughput tracker for generation budgeting.
Configuration structs with defaults.
Error types for cross-.so error reporting.
entropic_error_t
Error codes returned by all C API functions.
GrammarRegistry — named grammar management and validation.
InferenceBackend concrete base class.
Activate model on GPU (WARM → ACTIVE).
ProfileRegistry – named GPU resource profile management.
Unified lifecycle for non-primary inference backends.
Generation parameters for a single inference call.
Result of a single generation call.
SpeculativeConfig speculative
Speculative decoding (gh#36)
Result of a speculative-decoding compatibility check.
bool compatible
true when speculative may proceed
std::string diagnostic
Reason on failure (empty on ok)
Full parsed configuration.
InferenceConfig inference
Inference-side knobs (currently speculative decoding only).
Result metadata from a routing decision.
std::string adapter_name
Active adapter (empty = base model) (v1.9.2)
double routing_ms
Total routing time.
std::string model_raw
Raw model output (e.g. "2")
std::string tier_name
Selected tier.
std::string swap_action
"none", "reused", "loaded"
std::string previous_tier
Previous tier (empty if first)
double adapter_swap_ms
Adapter swap latency (v1.9.2)
bool enabled
Master switch (off by default)
ThroughputTracker – real-time throughput measurement and prediction.