entropic/orchestrator_8h_source.html

// SPDX-License-Identifier: Apache-2.0

#pragma once


#include <entropic/inference/backend.h>
#include <entropic/inference/adapter_manager.h>
#include <entropic/inference/grammar_registry.h>
#include <entropic/inference/profile_registry.h>
#include <entropic/inference/secondary_model_loader.h>
#include <entropic/inference/throughput_tracker.h>
#include <entropic/inference/adapters/adapter_base.h>
#include <entropic/types/config.h>
#include <entropic/types/error.h>

#include <atomic>
#include <chrono>
#include <cstddef>
#include <filesystem>
#include <functional>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>


struct llama_context;  // Forward declaration for adapter management

struct llama_model;    // Forward declaration for speculative compat (v2.1.11)


namespace entropic {


/// Result metadata from a routing decision.
///
/// Plain aggregate; a default-constructed value has empty strings,
/// swap_action == "none" and both timings at 0.0.
struct RoutingResult {
    /// Selected tier.
    std::string tier_name;

    /// Previous tier (empty if first).
    std::string previous_tier;

    /// Raw model output (e.g. "2").
    std::string model_raw;

    /// One of "none", "reused", "loaded".
    std::string swap_action{"none"};

    /// Total routing time (milliseconds, per the field name).
    double routing_ms{0.0};

    /// Active adapter (empty = base model) (v1.9.2).
    std::string adapter_name;

    /// Adapter swap latency (v1.9.2; milliseconds, per the field name).
    double adapter_swap_ms{0.0};
};


/// Per-tier sampler overrides parsed from identity frontmatter.
///
/// Every field is optional and starts disengaged, i.e. it carries no
/// override value until one is assigned.
struct TierSamplerOverrides {
    std::optional<float> temperature = std::nullopt;          ///< gh#82
    std::optional<int> max_output_tokens = std::nullopt;      ///< gh#82
    std::optional<float> top_p = std::nullopt;                ///< gh#85
    std::optional<int> top_k = std::nullopt;                  ///< gh#85
    std::optional<float> min_p = std::nullopt;                ///< gh#85
    std::optional<float> presence_penalty = std::nullopt;     ///< gh#85
    std::optional<float> frequency_penalty = std::nullopt;    ///< gh#85
    std::optional<float> repeat_penalty = std::nullopt;       ///< gh#86
    std::optional<bool> enable_thinking = std::nullopt;       ///< gh#86
    std::optional<std::string> tool_call_mode = std::nullopt; ///< gh#103
};


/// Apply per-tier sampler overrides to params.
///
/// @param params Generation parameters, taken by mutable reference.
/// @param ov     Per-tier override set to apply.
///
/// NOTE(review): presumably only engaged optionals in `ov` are written into
/// `params`; the definition lives in orchestrator.cpp — confirm there.
ENTROPIC_EXPORT void apply_tier_sampler_overrides(

    GenerationParams& params,

    const TierSamplerOverrides& ov);


/// Multi-model lifecycle and routing orchestrator.
///
/// Holds a pool of inference backends, a tier-to-backend mapping, per-tier
/// chat adapters, and the LoRA / grammar / profile / throughput helpers,
/// and exposes generation, routing and residency queries on top of them.
class ModelOrchestrator {

public:

    /// Initialize from parsed config.
    /// @return bool status. NOTE(review): presumably true on success — confirm.
    bool initialize(const ParsedConfig& config);


    /// Shutdown — unload all models.
    void shutdown();


    /// Destructor — invokes shutdown() and AdapterManager::unload_all().
    ~ModelOrchestrator();


    /* ── Generation ──────────────────────────────────────── */


    /// Generate using routed or explicit tier.
    /// @param messages  Conversation to generate from.
    /// @param params    Generation parameters for this call.
    /// @param tier_name Explicit tier; defaults to "".
    ///                  NOTE(review): presumably "" means "route automatically" — confirm.
    GenerationResult generate(

        const std::vector<Message>& messages,

        const GenerationParams& params,

        const std::string& tier_name = "");


    /// Overload of generate() that additionally takes a cancel flag by
    /// reference. NOTE(review): presumably polled to abort generation — confirm.
    GenerationResult generate(

        const std::vector<Message>& messages,

        const GenerationParams& params,

        std::atomic<bool>& cancel,

        const std::string& tier_name = "");


    /// Same-prefix batch generation on a shared resident model (gh#98).
    /// NOTE(review): the three input vectors look like parallel arrays
    /// (one entry per request) — confirm the length contract.
    std::vector<GenerationResult> generate_batch(

        const std::vector<std::vector<Message>>& messages_list,

        const std::vector<GenerationParams>& params_list,

        const std::vector<std::string>& tiers,

        std::atomic<bool>& cancel);


    /// Streaming generation.
    /// @param on_token Callback receiving a std::string_view per invocation.
    /// @param cancel   Cancel flag taken by reference.
    GenerationResult generate_streaming(

        const std::vector<Message>& messages,

        const GenerationParams& params,

        std::function<void(std::string_view)> on_token,

        std::atomic<bool>& cancel,

        const std::string& tier_name = "");


    /* ── Routing ─────────────────────────────────────────── */


    /// Route to tier using router model.
    /// @return Name of the selected tier.
    std::string route(const std::vector<Message>& messages);


    /* ── Queries ─────────────────────────────────────────── */


    /// Last routing result (returned by value).
    RoutingResult last_routing_result() const;


    /// Last used tier name.
    std::string last_used_tier() const;


    /// Currently loaded model tier names.
    std::vector<std::string> loaded_models() const;


    /// All configured tier names.
    std::vector<std::string> available_models() const;


    /// Check if handoff is permitted from tier `from` to tier `to`.
    bool can_handoff(const std::string& from, const std::string& to) const;


    /// Get adapter for a tier.
    /// NOTE(review): non-owning pointer; presumably nullptr for an unknown tier — confirm.
    ChatAdapter* get_adapter(const std::string& tier_name) const;


    /// Get the inference backend for a tier (for evaluation APIs).
    /// NOTE(review): non-owning pointer; presumably nullptr for an unknown tier — confirm.
    InferenceBackend* get_backend(const std::string& tier_name) const;


    /// Access the LoRA adapter manager.
    AdapterManager& adapter_manager() { return lora_manager_; }


    /// Access the grammar registry.
    GrammarRegistry& grammar_registry() { return grammar_registry_; }


    /// Access the GPU resource profile registry.
    ProfileRegistry& profile_registry() { return profile_registry_; }


    /// Access the throughput tracker.
    ThroughputTracker& throughput_tracker() { return throughput_tracker_; }


    /// Load grammars from an explicit directory path.
    /// NOTE(review): the size_t result is presumably the number of grammars loaded — confirm.
    size_t load_grammars_from(const std::filesystem::path& grammar_dir);


    /// Invalidate prompt/KV caches across every pooled backend.
    void clear_all_prompt_caches();


    /// Return true if any configured tier declares the "vision" capability (gh#41, v2.1.8).
    bool has_vision_capable_tier() const;


    /// Result of a speculative-decoding compatibility check.
    struct SpeculativeCompatInfo {

        /// true when speculative may proceed
        bool compatible = false;

        /// Reason on failure (empty on ok)
        std::string diagnostic;

    };


    /// Check whether the currently-configured target/draft pair is compatible
    /// for speculative decoding.
    SpeculativeCompatInfo check_speculative_compat() const;


    /// Runtime toggle for the speculative-decoding path.
    /// Writes the flag straight into the stored config copy; no lock is taken here.
    void set_speculative_enabled(bool enabled) {

        config_.inference.speculative.enabled = enabled;

    }


    /* ── VRAM-aware tier residency (v2.2.4, gh#57) ────────── */


    /// Residency observer event codes — mirror the C ABI enum
    /// entropic_residency_event_t exactly, so the numeric values must not change.
    enum class ResidencyEvent : int {

        Loaded = 0,

        Evicted = 1,

        ActivationSwap = 2,

    };


    /// Residency observer callback type (internal C++ form).
    /// Receives the event code, tier name, model path and a footprint value
    /// (presumably bytes, matching the other *_bytes APIs — confirm).
    using ResidencyObserverFn = std::function<void(

        ResidencyEvent event,

        const std::string& tier_name,

        const std::string& model_path,

        size_t footprint)>;


    /// Register a residency observer.
    void set_residency_observer(ResidencyObserverFn cb);


    /// Serialize the current residency set as a JSON string.
    std::string residency_snapshot_json() const;


    /// Engine-tracked VRAM budget in bytes (0 = unknown).
    size_t vram_budget_bytes() const { return vram_budget_bytes_; }


    /// Estimated VRAM footprint for a given tier in bytes.
    size_t tier_footprint_bytes(const std::string& tier_name) const;


    /// Last residency-related error code, or ENTROPIC_OK if none.
    entropic_error_t last_residency_error() const { return last_residency_error_; }


    /// Clear last_residency_error() by resetting it to ENTROPIC_OK.
    void clear_last_residency_error() { last_residency_error_ = ENTROPIC_OK; }


    /// Pick the canonical vision-capable tier name (gh#41).
    std::string select_vision_tier() const;


    /// Test-only forwarder to the private per-tier sampler default
    /// application (gh#94, audit task #71).
    void apply_tier_sampler_defaults_for_test(GenerationParams& params,

                                              const std::string& tier_name) {

        apply_tier_sampler_defaults(params, tier_name);

    }


private:

    /* ── Model pool (one backend per unique path) ────────── */

    std::unordered_map<std::string, std::shared_ptr<InferenceBackend>> model_pool_;


    /* ── Tier → backend mapping (many-to-one) ────────────── */

    std::unordered_map<std::string, std::shared_ptr<InferenceBackend>> tiers_;


    /* ── Per-tier adapters (one-to-one, identity-specific) ── */

    std::unordered_map<std::string, std::unique_ptr<ChatAdapter>> adapters_;


    /* ── Secondary models (router, draft, future thinking) ── */

    SecondaryModelLoader secondary_loader_;


    /* ── Routing state ───────────────────────────────────── */

    std::unordered_map<std::string, std::string> tier_map_;

    std::unordered_map<std::string, std::unordered_set<std::string>> handoff_rules_;

    std::string default_tier_;

    std::string loaded_main_tier_;

    RoutingResult last_routing_result_;

    std::vector<std::string> tier_history_;


    // NOTE(review): name suggests this serializes model swaps; mutable so
    // const members can lock it. Which members it guards is not visible here.
    mutable std::mutex swap_mutex_;


    // Stored copy of the configuration; set_speculative_enabled() mutates it.
    ParsedConfig config_;


    /* ── Residency tracking (v2.2.4, gh#57) ──────────────── */


    // mutable: can be written from const member functions
    // (presumably a lazily-filled footprint cache — confirm).
    mutable std::unordered_map<std::string, size_t> tier_footprint_bytes_;


    std::unordered_map<std::string, long long> tier_last_activation_ms_;


    // Captured once, when the orchestrator object is constructed.
    std::chrono::steady_clock::time_point start_time_{std::chrono::steady_clock::now()};


    // Backs vram_budget_bytes(); starts at 0.
    size_t vram_budget_bytes_{0};


    ResidencyObserverFn residency_observer_;


    // Backs last_residency_error() / clear_last_residency_error().
    entropic_error_t last_residency_error_{ENTROPIC_OK};


    size_t estimate_footprint_bytes(const std::string& tier_name) const;


    static size_t resolve_vram_budget_bytes();


    // Takes the same argument list as ResidencyObserverFn.
    void fire_residency_observer(

        ResidencyEvent event,

        const std::string& tier_name,

        const std::string& model_path,

        size_t footprint);


    /* ── LoRA adapter management (v1.9.2) ────────────────── */

    AdapterManager lora_manager_;


    /* ── Grammar registry (v1.9.3) ────────────────────────── */

    GrammarRegistry grammar_registry_;


    /* ── Profile registry (v2.0.0) ───────────────────────── */

    ProfileRegistry profile_registry_;


    /* ── Throughput tracker (v2.0.0) ─────────────────────── */

    ThroughputTracker throughput_tracker_;


    /* ── Internal ────────────────────────────────────────── */


    InferenceBackend* get_model(const std::string& tier_name);


    void record_activation_reuse(const std::string& tier_name);


    bool residency_admits(const std::string& tier_name);


    InferenceBackend* activate_and_track(

        const std::string& tier_name,

        const std::shared_ptr<InferenceBackend>& backend);


    GenerationResult build_no_model_error(const std::string& tier_name);


    void deactivate_current_if_needed(InferenceBackend* incoming);


    void ensure_tier_lora(const std::string& tier_name,

                          InferenceBackend* result);


    void unload_or_warm_current(InferenceBackend* current);


    std::pair<std::string, std::string> classify_task(

        const std::vector<Message>& messages);


    bool deactivate_if_active(llama_context* ctx);


    // NOTE(review): the double result is presumably the adapter swap latency
    // reported in RoutingResult::adapter_swap_ms — confirm.
    double ensure_adapter_for_tier(

        const std::string& tier_name, llama_context* ctx);


    void preload_adapters();


    bool create_tier_backends(const ParsedConfig& config);


    void build_routing_tables(const ParsedConfig& config);


    bool activate_default_tier(const ParsedConfig& config);


    void activate_router(const ParsedConfig& config);


    void activate_draft(const ParsedConfig& config);


    void load_bundled_grammars();


    void resolve_grammar_key(GenerationParams& params,

                             const std::string& tier_name);


    // Private per-tier sampler default application; the public
    // apply_tier_sampler_defaults_for_test() forwards to this.
    void apply_tier_sampler_defaults(GenerationParams& params,

                                     const std::string& tier_name);


    GenerationParams resolve_and_stage(InferenceBackend* model,

                                       const GenerationParams& params,

                                       const std::string& tier_name);


    // target_out / draft_out are out-parameters (references to pointers).
    // NOTE(review): the string result is presumably a diagnostic — confirm.
    std::string resolve_speculative_pair(

        llama_model*& target_out, llama_model*& draft_out) const;


    // The try_*_route helpers return bool and deliver their output through
    // the trailing `result` out-parameter.
    // NOTE(review): presumably true means the route handled the request — confirm.
    bool try_speculative_route(

        InferenceBackend* model,

        const std::vector<Message>& messages,

        const GenerationParams& params,

        const std::string& tier_name,

        GenerationResult& result);


    GenerationResult run_generate_dispatch(

        InferenceBackend* model,

        const std::vector<Message>& messages,

        const GenerationParams& params,

        const std::string& tier_name);


    bool try_speculative_route_streaming(

        InferenceBackend* model,

        const std::vector<Message>& messages,

        const GenerationParams& params,

        const std::string& tier_name,

        std::function<void(std::string_view)> on_token,

        std::atomic<bool>& cancel,

        GenerationResult& result);


    bool try_mtp_route(

        InferenceBackend* model,

        const std::vector<Message>& messages,

        const GenerationParams& params,

        std::function<void(std::string_view)> on_token,

        std::atomic<bool>& cancel,

        GenerationResult& result);


    bool resolve_mtp_effective(const std::string& tier_name) const;

};


} // namespace entropic

adapter_base.h
ChatAdapter concrete base class.

adapter_manager.h
AdapterManager — LoRA adapter lifecycle and hot-swap.

entropic::AdapterManager
LoRA adapter lifecycle manager.
Definition adapter_manager.h:58

entropic::ChatAdapter
Concrete base class for chat format adapters (80% logic).
Definition adapter_base.h:112

entropic::GrammarRegistry
Centralized grammar registry for named GBNF grammars.
Definition grammar_registry.h:45

entropic::InferenceBackend
Concrete base class for inference backends (80% logic).
Definition backend.h:69

entropic::ModelOrchestrator
Multi-model lifecycle and routing orchestrator.
Definition orchestrator.h:112

entropic::ModelOrchestrator::vram_budget_bytes
size_t vram_budget_bytes() const
Engine-tracked VRAM budget in bytes (0 = unknown).
Definition orchestrator.h:453

entropic::ModelOrchestrator::check_speculative_compat
SpeculativeCompatInfo check_speculative_compat() const
Check whether the currently-configured target/draft pair is compatible for speculative decoding.
Definition orchestrator.cpp:1508

entropic::ModelOrchestrator::available_models
std::vector< std::string > available_models() const
All configured tier names.
Definition orchestrator.cpp:1174

entropic::ModelOrchestrator::load_grammars_from
size_t load_grammars_from(const std::filesystem::path &grammar_dir)
Load grammars from an explicit directory path.
Definition orchestrator.cpp:1381

entropic::ModelOrchestrator::grammar_registry
GrammarRegistry & grammar_registry()
Access the grammar registry.
Definition orchestrator.h:286

entropic::ModelOrchestrator::generate_streaming
GenerationResult generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > &cancel, const std::string &tier_name="")
Streaming generation.
Definition orchestrator.cpp:714

entropic::ModelOrchestrator::apply_tier_sampler_defaults_for_test
void apply_tier_sampler_defaults_for_test(GenerationParams &params, const std::string &tier_name)
Test-only forwarder to the private per-tier sampler default application (gh#94, audit task #71).
Definition orchestrator.h:523

entropic::ModelOrchestrator::loaded_models
std::vector< std::string > loaded_models() const
Currently loaded model tier names.
Definition orchestrator.cpp:1156

entropic::ModelOrchestrator::initialize
bool initialize(const ParsedConfig &config)
Initialize from parsed config.
Definition orchestrator.cpp:197

entropic::ModelOrchestrator::has_vision_capable_tier
bool has_vision_capable_tier() const
Return true if any configured tier declares the "vision" capability (gh#41, v2.1.8).
Definition orchestrator.cpp:1417

entropic::ModelOrchestrator::tier_footprint_bytes
size_t tier_footprint_bytes(const std::string &tier_name) const
Estimated VRAM footprint for a given tier in bytes.
Definition orchestrator.cpp:1714

entropic::ModelOrchestrator::adapter_manager
AdapterManager & adapter_manager()
Access the LoRA adapter manager.
Definition orchestrator.h:278

entropic::ModelOrchestrator::shutdown
void shutdown()
Shutdown — unload all models.
Definition orchestrator.cpp:247

entropic::ModelOrchestrator::last_routing_result
RoutingResult last_routing_result() const
Last routing result.
Definition orchestrator.cpp:1134

entropic::ModelOrchestrator::ResidencyObserverFn
std::function< void(ResidencyEvent event, const std::string &tier_name, const std::string &model_path, size_t footprint)> ResidencyObserverFn
Residency observer callback type (internal C++ form).
Definition orchestrator.h:416

entropic::ModelOrchestrator::generate
GenerationResult generate(const std::vector< Message > &messages, const GenerationParams &params, const std::string &tier_name="")
Generate using routed or explicit tier.
Definition orchestrator.cpp:570

entropic::ModelOrchestrator::clear_last_residency_error
void clear_last_residency_error()
Clear last_residency_error().
Definition orchestrator.h:486

entropic::ModelOrchestrator::clear_all_prompt_caches
void clear_all_prompt_caches()
Invalidate prompt/KV caches across every pooled backend.
Definition orchestrator.cpp:1402

entropic::ModelOrchestrator::last_residency_error
entropic_error_t last_residency_error() const
Last residency-related error code, or ENTROPIC_OK if none.
Definition orchestrator.h:479

entropic::ModelOrchestrator::route
std::string route(const std::vector< Message > &messages)
Route to tier using router model.
Definition orchestrator.cpp:766

entropic::ModelOrchestrator::get_adapter
ChatAdapter * get_adapter(const std::string &tier_name) const
Get adapter for a tier.
Definition orchestrator.cpp:1219

entropic::ModelOrchestrator::set_residency_observer
void set_residency_observer(ResidencyObserverFn cb)
Register a residency observer.
Definition orchestrator.cpp:1731

entropic::ModelOrchestrator::last_used_tier
std::string last_used_tier() const
Last used tier name.
Definition orchestrator.cpp:1143

entropic::ModelOrchestrator::~ModelOrchestrator
~ModelOrchestrator()
Destructor — invokes shutdown() and AdapterManager::unload_all().
Definition orchestrator.cpp:264

entropic::ModelOrchestrator::generate_batch
std::vector< GenerationResult > generate_batch(const std::vector< std::vector< Message > > &messages_list, const std::vector< GenerationParams > &params_list, const std::vector< std::string > &tiers, std::atomic< bool > &cancel)
Same-prefix batch generation on a shared resident model (gh#98).
Definition orchestrator.cpp:672

entropic::ModelOrchestrator::select_vision_tier
std::string select_vision_tier() const
Pick the canonical vision-capable tier name (gh#41).
Definition orchestrator.cpp:1430

entropic::ModelOrchestrator::profile_registry
ProfileRegistry & profile_registry()
Access the GPU resource profile registry.
Definition orchestrator.h:294

entropic::ModelOrchestrator::throughput_tracker
ThroughputTracker & throughput_tracker()
Access the throughput tracker.
Definition orchestrator.h:302

entropic::ModelOrchestrator::set_speculative_enabled
void set_speculative_enabled(bool enabled)
Runtime toggle for the speculative-decoding path.
Definition orchestrator.h:380

entropic::ModelOrchestrator::can_handoff
bool can_handoff(const std::string &from, const std::string &to) const
Check if handoff is permitted.
Definition orchestrator.cpp:1204

entropic::ModelOrchestrator::residency_snapshot_json
std::string residency_snapshot_json() const
Serialize the current residency set as a JSON string.
Definition orchestrator.cpp:1802

entropic::ModelOrchestrator::ResidencyEvent
ResidencyEvent
Residency observer event codes — mirror the C ABI enum entropic_residency_event_t exactly (LOADED=0, EVICTED=1, ACTIVATION_SWAP=2).
Definition orchestrator.h:392

entropic::ModelOrchestrator::get_backend
InferenceBackend * get_backend(const std::string &tier_name) const
Get the inference backend for a tier (for evaluation APIs).
Definition orchestrator.cpp:1192

entropic::ProfileRegistry
Centralized registry for named GPU resource profiles.
Definition profile_registry.h:41

entropic::SecondaryModelLoader
Role-keyed lifecycle manager for non-primary models.
Definition secondary_model_loader.h:55

entropic::ThroughputTracker
EWMA-based throughput tracker for generation budgeting.
Definition throughput_tracker.h:43

config.h
Configuration structs with defaults.

error.h
Error types for cross-.so error reporting.

entropic_error_t
entropic_error_t
Error codes returned by all C API functions.
Definition error.h:35

ENTROPIC_OK
@ ENTROPIC_OK
Success.
Definition error.h:36

grammar_registry.h
GrammarRegistry — named grammar management and validation.

backend.h
InferenceBackend concrete base class.

entropic
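Top-level namespace for all entropic engine declarations. (The brief Doxygen attached here, "Activate model on GPU (WARM → ACTIVE).", is a stray function comment, not a description of the namespace.)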
Definition bundled_models.h:20

entropic::apply_tier_sampler_overrides
ENTROPIC_EXPORT void apply_tier_sampler_overrides(GenerationParams &params, const TierSamplerOverrides &ov)
Apply per-tier sampler overrides to params.
Definition orchestrator.cpp:1606

profile_registry.h
ProfileRegistry — named GPU resource profile management.

secondary_model_loader.h
Unified lifecycle for non-primary inference backends.

entropic::GenerationParams
Generation parameters for a single inference call.
Definition config.h:302

entropic::GenerationResult
Result of a single generation call.
Definition generation_result.h:30

entropic::InferenceConfig::speculative
SpeculativeConfig speculative
Speculative decoding (gh#36)
Definition config.h:918

entropic::ModelOrchestrator::SpeculativeCompatInfo
Result of a speculative-decoding compatibility check.
Definition orchestrator.h:344

entropic::ModelOrchestrator::SpeculativeCompatInfo::compatible
bool compatible
true when speculative may proceed
Definition orchestrator.h:345

entropic::ModelOrchestrator::SpeculativeCompatInfo::diagnostic
std::string diagnostic
Reason on failure (empty on ok)
Definition orchestrator.h:346

entropic::ParsedConfig
Full parsed configuration.
Definition config.h:929

entropic::ParsedConfig::inference
InferenceConfig inference
Inference-side knobs (currently speculative decoding only).
Definition config.h:985

entropic::RoutingResult
Result metadata from a routing decision.
Definition orchestrator.h:53

entropic::RoutingResult::adapter_name
std::string adapter_name
Active adapter (empty = base model) (v1.9.2)
Definition orchestrator.h:59

entropic::RoutingResult::routing_ms
double routing_ms
Total routing time.
Definition orchestrator.h:58

entropic::RoutingResult::model_raw
std::string model_raw
Raw model output (e.g. "2")
Definition orchestrator.h:56

entropic::RoutingResult::tier_name
std::string tier_name
Selected tier.
Definition orchestrator.h:54

entropic::RoutingResult::swap_action
std::string swap_action
"none", "reused", "loaded"
Definition orchestrator.h:57

entropic::RoutingResult::previous_tier
std::string previous_tier
Previous tier (empty if first)
Definition orchestrator.h:55

entropic::RoutingResult::adapter_swap_ms
double adapter_swap_ms
Adapter swap latency (v1.9.2)
Definition orchestrator.h:60

entropic::SpeculativeConfig::enabled
bool enabled
Master switch (off by default)
Definition config.h:877

entropic::TierSamplerOverrides
Per-tier sampler overrides parsed from identity frontmatter.
Definition orchestrator.h:74

entropic::TierSamplerOverrides::top_p
std::optional< float > top_p
gh#85
Definition orchestrator.h:77

entropic::TierSamplerOverrides::temperature
std::optional< float > temperature
gh#82
Definition orchestrator.h:75

entropic::TierSamplerOverrides::min_p
std::optional< float > min_p
gh#85
Definition orchestrator.h:79

entropic::TierSamplerOverrides::presence_penalty
std::optional< float > presence_penalty
gh#85
Definition orchestrator.h:80

entropic::TierSamplerOverrides::tool_call_mode
std::optional< std::string > tool_call_mode
gh#103
Definition orchestrator.h:84

entropic::TierSamplerOverrides::frequency_penalty
std::optional< float > frequency_penalty
gh#85
Definition orchestrator.h:81

entropic::TierSamplerOverrides::top_k
std::optional< int > top_k
gh#85
Definition orchestrator.h:78

entropic::TierSamplerOverrides::enable_thinking
std::optional< bool > enable_thinking
gh#86
Definition orchestrator.h:83

entropic::TierSamplerOverrides::repeat_penalty
std::optional< float > repeat_penalty
gh#86
Definition orchestrator.h:82

entropic::TierSamplerOverrides::max_output_tokens
std::optional< int > max_output_tokens
gh#82
Definition orchestrator.h:76

throughput_tracker.h
ThroughputTracker — real-time throughput measurement and prediction.