22#include <nlohmann/json.hpp>
41std::string extract_latest_user_message(
const std::vector<Message>& messages) {
42 for (
auto it = messages.rbegin(); it != messages.rend(); ++it) {
43 if (it->role ==
"user") {
67bool ModelOrchestrator::create_tier_backends(
const ParsedConfig& config) {
68 for (
const auto& [name, tier_config] : config.models.tiers) {
69 std::string path_key = tier_config.path.string();
70 if (!std::filesystem::exists(tier_config.path)) {
71 logger->error(
"Model file not found for tier '{}': {}",
73 logger->error(
"Place a GGUF file at the path above, or set "
74 "ENTROPIC_MODEL_DIR to a directory containing "
75 "it. Run `entropic download --list` to see "
76 "bundled model keys, then "
77 "`entropic download <key>` to fetch one.");
80 if (model_pool_.find(path_key) == model_pool_.end()) {
81 model_pool_[path_key] = std::make_shared<LlamaCppBackend>();
83 tiers_[name] = model_pool_[path_key];
85 tier_config.adapter, name,
"" );
90 logger->info(
"Created {} unique backend(s) for {} tier(s)",
91 model_pool_.size(), tiers_.size());
/// Fill the orchestrator's routing lookup tables from the parsed config.
///
/// @param config Parsed configuration; only config.routing.tier_map and
///               config.routing.handoff_rules are read here.
///
/// Every (digit, tier name) pair is copied into tier_map_. Each handoff
/// rule's target collection is stored in handoff_rules_ as a
/// std::unordered_set<std::string> keyed by the source name. Both tables are
/// written with operator[], so an entry that already exists for a key is
/// replaced.
101void ModelOrchestrator::build_routing_tables(
const ParsedConfig& config) {
102 for (
const auto& [digit, tier_name] : config.routing.tier_map) {
103 tier_map_[digit] = tier_name;
// Targets are rebuilt as a set from the configured range for each source.
105 for (
const auto& [src, targets] : config.routing.handoff_rules) {
106 handoff_rules_[src] = std::unordered_set<std::string>(
107 targets.begin(), targets.end());
118bool ModelOrchestrator::activate_default_tier(
const ParsedConfig& config) {
119 if (tiers_.find(default_tier_) == tiers_.end()) {
return true; }
120 auto& backend = tiers_[default_tier_];
121 auto& tier_cfg = config.models.tiers.at(default_tier_);
122 if (!backend->load_and_activate(tier_cfg)) {
123 logger->error(
"Failed to activate default tier: {}", default_tier_);
126 loaded_main_tier_ = default_tier_;
127 logger->info(
"Activated default tier: {}", default_tier_);
142void ModelOrchestrator::activate_router(
const ParsedConfig& config) {
143 if (!config.models.router) {
return; }
146 secondary_loader_.
ensure_loaded(
"router", *config.models.router);
162void ModelOrchestrator::activate_draft(
const ParsedConfig& config) {
163 const auto& spec = config.inference.speculative;
164 if (!spec.enabled || spec.draft.path.empty()) {
return; }
190 vram_budget_bytes_ = resolve_vram_budget_bytes();
191 if (vram_budget_bytes_ > 0) {
192 logger->info(
"[residency] VRAM budget: {} bytes "
193 "(ENTROPIC_VRAM_BUDGET_BYTES)",
199 auto path = (config.
log_dir /
"llama_ggml.log").
string();
201 logger->info(
"ggml logging: {}", path);
204 logger->info(
"Initializing model orchestrator");
206 if (!create_tier_backends(config)) {
return false; }
207 build_routing_tables(config);
208 if (!activate_default_tier(config)) {
return false; }
209 activate_router(config);
210 activate_draft(config);
213 load_bundled_grammars();
227 logger->info(
"Shutting down model orchestrator");
229 for (
auto& [path, backend] : model_pool_) {
230 if (backend->is_loaded()) {
260 const std::vector<Message>& messages,
264 && try_speculative_route(model, messages, params, result);
266 result = model->
generate(messages, params);
281bool ModelOrchestrator::try_speculative_route_streaming(
282 InferenceBackend* model,
283 const std::vector<Message>& messages,
284 const GenerationParams& params,
285 std::function<
void(std::string_view)> on_token,
286 std::atomic<bool>& cancel,
287 GenerationResult& result)
290 bool kernel_ran =
false;
291 if (!compat.compatible) {
292 logger->info(
"Speculative requested but pair incompatible "
293 "({}); using plain decode", compat.diagnostic);
295 auto* llama_target =
dynamic_cast<LlamaCppBackend*
>(model);
296 auto* draft_be = secondary_loader_.
get(
"draft");
297 auto* llama_draft =
dynamic_cast<LlamaCppBackend*
>(draft_be);
298 if (llama_target ==
nullptr || llama_draft ==
nullptr) {
299 logger->
info(
"Speculative compat passed but target/draft "
300 "is not llama.cpp; using plain decode");
302 auto spec = llama_target->generate_speculative_with_draft(
303 messages, params, on_token, cancel, *llama_draft,
307 logger->info(
"Speculative kernel returned NOT_SUPPORTED "
308 "({}); falling back", spec.error_message);
310 result = std::move(spec);
/// Non-streaming wrapper around try_speculative_route_streaming().
///
/// @param model    Backend forwarded unchanged to the streaming variant.
/// @param messages Conversation forwarded unchanged.
/// @param params   Generation parameters forwarded unchanged.
/// @param result   Output object forwarded by reference.
/// @return Whatever try_speculative_route_streaming() returns.
///
/// The streaming variant requires a token callback and a cancel flag; this
/// overload supplies a callback that ignores its argument and a
/// function-local atomic flag initialised to false.
324bool ModelOrchestrator::try_speculative_route(
325 InferenceBackend* model,
326 const std::vector<Message>& messages,
327 const GenerationParams& params,
328 GenerationResult& result)
// Local flag: nothing outside this call holds a reference to it.
330 std::atomic<bool> local_cancel{
false};
331 return try_speculative_route_streaming(
332 model, messages, params,
333 [](std::string_view){}, local_cancel, result);
347 if (!adapter || result.
content.empty()) {
return; }
350 result.
content = parsed.cleaned_content;
351 result.
tool_calls = std::move(parsed.tool_calls);
366 const std::string& selected,
367 const std::string& adapter_name,
369 double routing_ms,
double swap_ms) {
370 logger->info(
"Orchestration: tier={}, adapter={}, grammar={}",
371 selected, adapter_name,
372 params.
grammar.empty() ?
"unconstrained"
374 logger->info(
"Total: {:.0f}ms (route={:.0f}ms, swap={:.0f}ms, "
376 result.
total_ms, routing_ms, swap_ms,
400 const std::vector<Message>& messages,
402 const std::string& tier_name)
404 auto t_start = now();
407 std::string selected = tier_name;
408 double routing_ms = 0.0;
409 if (selected.empty()) {
410 auto t_route = now();
411 selected =
route(messages);
412 routing_ms = elapsed_ms(t_route, now());
418 double swap_ms = elapsed_ms(t_swap, now());
420 if (!model) {
return build_no_model_error(selected); }
424 resolve_grammar_key(resolved_params, selected);
428 model, messages, resolved_params);
434 result.
total_ms = elapsed_ms(t_start, now());
436 resolved_params, routing_ms, swap_ms);
454 const std::vector<Message>& messages,
456 std::function<
void(std::string_view)> on_token,
457 std::atomic<bool>& cancel,
458 const std::string& tier_name)
460 std::string selected = tier_name.empty() ?
route(messages) : tier_name;
473 resolve_grammar_key(resolved_params, selected);
483 && try_speculative_route_streaming(
484 model, messages, resolved_params, on_token, cancel,
486 return spec_streaming;
509 logger->info(
"Route: routing disabled, using default '{}'",
511 last_routing_result_ = {default_tier_,
"",
"",
"none", 0.0};
512 return default_tier_;
515 auto [tier, raw] = classify_task(messages);
516 last_routing_result_ = {tier, loaded_main_tier_, raw,
"none", 0.0};
519 tier_history_.push_back(tier);
520 if (tier_history_.size() > 5) {
521 tier_history_.erase(tier_history_.begin());
524 logger->info(
"[ROUTER] {} | raw='{}'", tier, raw);
541std::pair<std::string, std::string> ModelOrchestrator::classify_task(
542 const std::vector<Message>& messages)
544 std::string user_msg = extract_latest_user_message(messages);
550 auto* router_backend = secondary_loader_.
get(
"router");
551 if (router_backend ==
nullptr) {
552 logger->warn(
"classify_task: router not loaded; returning empty");
555 auto result = router_backend->
complete(
556 user_msg +
" ->", router_params);
557 std::string raw = result.
content;
560 auto start = raw.find_first_not_of(
" \t\n\r");
561 if (start != std::string::npos) {
562 raw = raw.substr(start);
567 std::string digit(1, c);
568 auto it = tier_map_.find(digit);
569 if (it != tier_map_.end()) {
570 logger->info(
"Route: digit='{}' -> tier='{}'",
572 return {it->second, digit};
576 logger->warn(
"Route: no valid digit in '{}', defaulting to {}",
578 return {default_tier_,
""};
/// Record that a tier was used, and report a swap if the main tier changed.
///
/// @param tier_name Tier whose activation is being recorded.
///
/// Always stores the current timestamp in tier_last_activation_ms_. If
/// tier_name equals loaded_main_tier_ nothing else happens. Otherwise the
/// tier's footprint is cached, loaded_main_tier_ is updated, and the
/// residency observer is fired with ResidencyEvent::ActivationSwap.
601void ModelOrchestrator::record_activation_reuse(
602 const std::string& tier_name) {
// Milliseconds on the steady clock, measured relative to start_time_.
603 auto now_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
604 std::chrono::steady_clock::now() - start_time_).count();
// Compare before loaded_main_tier_ is overwritten further down.
605 bool tier_changed = (loaded_main_tier_ != tier_name);
606 tier_last_activation_ms_[tier_name] = now_ms;
// Same tier as before: only the timestamp is refreshed.
607 if (!tier_changed) {
return; }
608 auto tier_it = config_.
models.
tiers.find(tier_name);
// Path is the empty string when the tier has no entry in config_.models.tiers.
609 std::string path = tier_it != config_.
models.
tiers.end()
610 ? tier_it->second.path.string() :
"";
// Use the cached footprint when one exists; otherwise estimate it. Either
// way the value is written back to the cache.
611 size_t footprint = tier_footprint_bytes_.count(tier_name)
612 ? tier_footprint_bytes_[tier_name]
613 : estimate_footprint_bytes(tier_name);
614 tier_footprint_bytes_[tier_name] = footprint;
615 loaded_main_tier_ = tier_name;
616 fire_residency_observer(ResidencyEvent::ActivationSwap,
617 tier_name, path, footprint);
630bool ModelOrchestrator::residency_admits(
const std::string& tier_name) {
631 size_t footprint = estimate_footprint_bytes(tier_name);
633 tier_footprint_bytes_[tier_name] = footprint;
635 if (vram_budget_bytes_ > 0 && footprint > vram_budget_bytes_) {
636 logger->error(
"[residency] tier '{}' footprint {} bytes "
637 "exceeds VRAM budget {} bytes — "
638 "TIER_MODEL_TOO_LARGE (gh#57)",
639 tier_name, footprint, vram_budget_bytes_);
665GenerationResult ModelOrchestrator::build_no_model_error(
666 const std::string& tier_name) {
667 GenerationResult err;
668 err.finish_reason =
"error";
670 err.error_code = last_residency_error_;
671 err.error_message =
"Tier '" + tier_name +
"' model exceeds the "
672 "engine's VRAM budget (gh#57)";
676 err.error_message =
"No model available for tier: " + tier_name;
694InferenceBackend* ModelOrchestrator::activate_and_track(
695 const std::string& tier_name,
696 const std::shared_ptr<InferenceBackend>& backend) {
697 auto tier_it = config_.
models.
tiers.find(tier_name);
698 bool activated = tier_it != config_.
models.
tiers.end()
699 && backend->load_and_activate(tier_it->second);
701 logger->error(
"Failed to activate tier: {}", tier_name);
704 loaded_main_tier_ = tier_name;
706 auto now_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
707 std::chrono::steady_clock::now() - start_time_).count();
708 tier_last_activation_ms_[tier_name] = now_ms;
709 size_t footprint = tier_footprint_bytes_.count(tier_name)
710 ? tier_footprint_bytes_[tier_name] : 0;
711 fire_residency_observer(ResidencyEvent::Loaded,
712 tier_name, tier_it->second.path.string(),
714 return backend.get();
730InferenceBackend* ModelOrchestrator::get_model(
const std::string& tier_name) {
731 std::lock_guard<std::mutex> lock(swap_mutex_);
733 auto it = tiers_.find(tier_name);
734 std::string effective_tier = tier_name;
735 if (it == tiers_.end()) {
737 if (it != tiers_.end()) {
742 InferenceBackend* result =
nullptr;
743 if (it != tiers_.end() && it->second->is_active()) {
745 record_activation_reuse(effective_tier);
746 result = it->second.get();
747 }
else if (it != tiers_.end() && residency_admits(effective_tier)) {
748 deactivate_current_if_needed(it->second.get());
749 result = activate_and_track(effective_tier, it->second);
754 ensure_tier_lora(tier_name, result);
767void ModelOrchestrator::ensure_tier_lora(
const std::string& tier_name,
768 InferenceBackend* result) {
769 auto* llama_backend =
dynamic_cast<LlamaCppBackend*
>(result);
770 llama_context* ctx = llama_backend
771 ? llama_backend->llama_context_ptr() :
nullptr;
772 double adapter_ms = ensure_adapter_for_tier(tier_name, ctx);
788void ModelOrchestrator::deactivate_current_if_needed(InferenceBackend* incoming) {
789 auto it = loaded_main_tier_.empty()
790 ? tiers_.end() : tiers_.find(loaded_main_tier_);
792 bool should_swap = it != tiers_.end()
793 && it->second.get() != incoming
794 && it->second->is_loaded();
801 auto* llama_backend =
dynamic_cast<LlamaCppBackend*
>(it->second.get());
804 llama_backend->llama_model_ptr(),
805 llama_backend->llama_context_ptr());
808 unload_or_warm_current(it->second.get());
817void ModelOrchestrator::unload_or_warm_current(InferenceBackend* current) {
818 auto cfg_it = config_.
models.
tiers.find(loaded_main_tier_);
819 bool keep_warm = cfg_it != config_.
models.
tiers.end()
820 && cfg_it->second.keep_warm;
823 logger->info(
"Deactivating {} (keep_warm=true)", loaded_main_tier_);
824 current->deactivate();
827 logger->info(
"Unloading {} (keep_warm=false)", loaded_main_tier_);
828 std::string path = cfg_it != config_.
models.
tiers.end()
829 ? cfg_it->second.path.string() :
"";
830 size_t footprint = tier_footprint_bytes_.count(loaded_main_tier_)
831 ? tier_footprint_bytes_[loaded_main_tier_] : 0;
832 std::string evicted_tier = loaded_main_tier_;
834 fire_residency_observer(ResidencyEvent::Evicted,
835 evicted_tier, path, footprint);
846 return last_routing_result_;
855 return loaded_main_tier_;
868 std::vector<std::string> result;
869 for (
const auto& [name, backend] : tiers_) {
870 if (backend->is_loaded()) {
871 result.push_back(name);
874 if (secondary_loader_.
is_loaded(
"router")) {
875 result.push_back(
"router");
886 std::vector<std::string> result;
887 for (
const auto& [name, _] : tiers_) {
888 result.push_back(name);
891 result.push_back(
"router");
904 const std::string& tier_name)
const {
905 auto it = tiers_.find(tier_name);
906 if (it == tiers_.end()) {
return nullptr; }
907 return it->second.get();
916 const std::string& from,
const std::string& to)
const
918 auto it = handoff_rules_.find(from);
919 if (it == handoff_rules_.end()) {
922 return it->second.count(to) > 0;
931 auto it = adapters_.find(tier_name);
932 if (it != adapters_.end()) {
933 return it->second.get();
960bool ModelOrchestrator::deactivate_if_active(llama_context* ctx) {
976double ModelOrchestrator::ensure_adapter_for_tier(
977 const std::string& tier_name, llama_context* ctx)
979 auto tier_it = config_.
models.
tiers.find(tier_name);
984 const auto& tier_cfg = tier_it->second;
985 auto t_start =
now();
986 bool needs_kv_clear =
false;
988 if (!tier_cfg.adapter_path) {
989 needs_kv_clear = deactivate_if_active(ctx);
991 needs_kv_clear = lora_manager_.
swap(tier_name, ctx);
992 if (!needs_kv_clear) {
993 logger->warn(
"Adapter swap to '{}' failed", tier_name);
997 if (needs_kv_clear && ctx) {
998 llama_memory_clear(llama_get_memory(ctx),
true);
999 logger->info(
"Adapter swap for tier '{}' in {:.1f}ms",
1015void ModelOrchestrator::preload_adapters() {
1018 for (
const auto& [name, tier_cfg] : config_.models.tiers) {
1019 if (!tier_cfg.adapter_path) {
1023 auto tier_it = tiers_.find(name);
1024 if (tier_it == tiers_.end()) {
1028 auto* llama_backend =
dynamic_cast<LlamaCppBackend*
>(
1029 tier_it->second.get());
1030 if (!llama_backend || !llama_backend->llama_model_ptr()) {
1031 logger->warn(
"Cannot preload adapter for '{}' — model not loaded",
1036 bool ok = lora_manager_.
load(
1038 *tier_cfg.adapter_path,
1039 llama_backend->llama_model_ptr(),
1040 tier_cfg.adapter_scale);
1048 logger->info(
"Preloaded {} LoRA adapter(s) to WARM", loaded);
1063void ModelOrchestrator::load_bundled_grammars() {
1064 std::filesystem::path grammar_dir;
1066 grammar_dir = config_.
config_dir /
"grammars";
1068 if (grammar_dir.empty() || !std::filesystem::is_directory(grammar_dir)) {
1071 logger->info(
"No bundled grammar directory found, skipping");
1075 size_t count = grammar_registry_.
load_bundled(grammar_dir);
1076 logger->info(
"Grammar registry: {} grammar(s) loaded from {}",
1077 count, grammar_dir.string());
1093 const std::filesystem::path& grammar_dir) {
1094 if (!std::filesystem::is_directory(grammar_dir)) {
1097 auto count = grammar_registry_.
load_bundled(grammar_dir);
1098 logger->info(
"Grammar registry: {} grammar(s) loaded from {}",
1099 count, grammar_dir.string());
1114 for (
auto& [_, backend] : model_pool_) {
1115 if (backend) { backend->clear_prompt_cache(); }
1118 logger->info(
"Prompt caches invalidated across all backends "
1119 "(identity change)");
1129 for (
const auto& [_, tier] : config_.
models.
tiers) {
1130 if (tier.has_capability(
"vision")) {
return true; }
1142 for (
const auto& [name, tier] : config_.
models.
tiers) {
1143 if (tier.has_capability(
"vision")) {
return name; }
1157 const std::shared_ptr<InferenceBackend>& tier_backend) {
1158 if (!tier_backend || !tier_backend->is_loaded()) {
1175std::string ModelOrchestrator::resolve_speculative_pair(
1176 llama_model*& target_out, llama_model*& draft_out)
const {
1177 target_out =
nullptr;
1178 draft_out =
nullptr;
1181 auto tier_it = tiers_.find(loaded_main_tier_);
1182 if (tier_it == tiers_.end()) {
1183 err =
"no main tier loaded";
1186 if (target_out ==
nullptr) {
1187 err =
"main tier backend is not a llama.cpp backend or "
1190 auto* draft_backend = secondary_loader_.
get(
"draft");
1191 if (draft_backend ==
nullptr || !draft_backend->is_loaded()) {
1192 err =
"no draft model configured for speculative "
1194 "(set inference.speculative.draft_model)";
1196 auto* d =
dynamic_cast<LlamaCppBackend*
>(draft_backend);
1197 draft_out = (d ==
nullptr) ?
nullptr : d->llama_model_ptr();
1198 if (draft_out ==
nullptr) {
1199 err =
"draft backend is not a llama.cpp backend";
1218ModelOrchestrator::SpeculativeCompatInfo
1221 llama_model* target_model =
nullptr;
1222 llama_model* draft_model =
nullptr;
1223 info.diagnostic = resolve_speculative_pair(target_model, draft_model);
1224 if (info.diagnostic.empty()) {
1225 auto result = entropic::speculative::check_compat(
1226 target_model, draft_model);
1227 info.compatible = result.compatible;
1228 info.diagnostic = std::move(result.diagnostic);
1245 std::filesystem::path p(grammar_value);
1246 if (p.extension() ==
".gbnf") {
1247 return p.stem().string();
1249 return grammar_value;
1266void ModelOrchestrator::resolve_grammar_key(
1267 GenerationParams& params,
const std::string& tier_name)
1269 if (!params.grammar.empty()) {
1274 std::string key = params.grammar_key;
1279 if (it != config_.
models.
tiers.end() && it->second.grammar) {
1288 std::string content = grammar_registry_.
get(key);
1289 if (content.empty()) {
1290 logger->warn(
"Grammar key '{}' not found in registry", key);
1294 logger->info(
"Grammar resolved: key='{}', {} bytes",
1295 key, content.size());
1296 params.grammar = std::move(content);
1311size_t ModelOrchestrator::resolve_vram_budget_bytes() {
1312 const char* env = std::getenv(
"ENTROPIC_VRAM_BUDGET_BYTES");
1313 if (env ==
nullptr || *env ==
'\0') {
return 0; }
1315 long long v = std::stoll(env);
1316 return (v < 0) ? 0 :
static_cast<size_t>(v);
1332size_t ModelOrchestrator::estimate_footprint_bytes(
1333 const std::string& tier_name)
const {
1334 auto tier_it = config_.
models.
tiers.find(tier_name);
1335 if (tier_it == config_.
models.
tiers.end()) {
return 0; }
1336 const auto& tier_cfg = tier_it->second;
1338 auto weights = std::filesystem::file_size(tier_cfg.path, ec);
1339 if (ec) {
return 0; }
1340 const size_t kv_per_token = 16ull * 1024ull;
1341 size_t kv =
static_cast<size_t>(tier_cfg.context_length) * kv_per_token;
1343 * 1024ull * 1024ull;
1344 return static_cast<size_t>(weights) + kv + headroom;
1353 const std::string& tier_name)
const {
1354 std::lock_guard<std::mutex> lock(swap_mutex_);
1355 auto it = tier_footprint_bytes_.find(tier_name);
1356 if (it != tier_footprint_bytes_.end()) {
return it->second; }
1357 size_t v = estimate_footprint_bytes(tier_name);
1359 tier_footprint_bytes_[tier_name] = v;
1370 std::lock_guard<std::mutex> lock(swap_mutex_);
1371 residency_observer_ = std::move(cb);
1379void ModelOrchestrator::fire_residency_observer(
1380 ResidencyEvent event,
1381 const std::string& tier_name,
1382 const std::string& model_path,
1384 const char* event_name =
"unknown";
1386 case ResidencyEvent::Loaded: event_name =
"loaded";
break;
1387 case ResidencyEvent::Evicted: event_name =
"evicted";
break;
1388 case ResidencyEvent::ActivationSwap: event_name =
"activation_swap";
break;
1390 logger->info(
"[residency] {} tier='{}' path='{}' footprint={} bytes",
1391 event_name, tier_name, model_path, footprint);
1392 if (residency_observer_) {
1393 residency_observer_(event, tier_name, model_path, footprint);
1415 const std::string& name,
const std::filesystem::path& path,
1416 int context_length,
size_t footprint,
int vram_reserve_mb,
1417 long long last_ms) {
1419 auto weights = std::filesystem::file_size(path, ec);
1420 size_t weights_b = ec ? 0u :
static_cast<size_t>(weights);
1421 size_t kv =
static_cast<size_t>(context_length) * 16ull * 1024ull;
1422 size_t headroom =
static_cast<size_t>(vram_reserve_mb)
1423 * 1024ull * 1024ull;
1426 {
"model_path", path.string()},
1427 {
"footprint_bytes", footprint},
1428 {
"weights_bytes", weights_b},
1429 {
"kv_cache_bytes", kv},
1430 {
"headroom_bytes", headroom},
1431 {
"last_activation_ms", last_ms}
1441 std::lock_guard<std::mutex> lock(swap_mutex_);
1443 j[
"vram_total_bytes"] = vram_budget_bytes_;
1444 j[
"vram_budget_bytes"] = vram_budget_bytes_;
1446 nlohmann::json arr = nlohmann::json::array();
1447 for (
const auto& [name, backend] : tiers_) {
1448 if (!backend || !backend->is_loaded()) {
continue; }
1450 if (tier_it == config_.
models.
tiers.end()) {
continue; }
1451 auto fp_it = tier_footprint_bytes_.find(name);
1452 size_t footprint = (fp_it != tier_footprint_bytes_.end())
1453 ? fp_it->second : estimate_footprint_bytes(name);
1454 in_use += footprint;
1455 auto la = tier_last_activation_ms_.find(name);
1456 long long last_ms = (la != tier_last_activation_ms_.end())
1459 name, tier_it->second.path, tier_it->second.context_length,
1462 j[
"residency"] = std::move(arr);
1463 j[
"vram_headroom_bytes"] = vram_budget_bytes_ > in_use
1464 ? vram_budget_bytes_ - in_use
1466 j[
"backend"] = vram_budget_bytes_ > 0 ?
"configured" :
"unknown";
Adapter factory — create adapters by name.
bool swap(const std::string &name, llama_context *ctx)
Swap to a different adapter atomically.
std::string active_adapter() const
Get the currently HOT adapter name.
void unload_all_for_model(llama_model *model, llama_context *ctx)
Unload all adapters for a given base model.
void deactivate(llama_context *ctx)
Deactivate current HOT adapter (HOT -> WARM).
bool load(const std::string &name, const std::filesystem::path &adapter_path, llama_model *model, float scale=1.0f)
Load a LoRA adapter into RAM (COLD -> WARM).
void unload_all()
Free every loaded adapter handle (gh#58 close-out, v2.3.0).
Concrete base class for chat format adapters (80% logic).
virtual ParseResult parse_tool_calls(const std::string &content) const =0
Parse tool calls from model output.
size_t load_bundled(const std::filesystem::path &grammar_dir)
Load all bundled grammars from a directory.
std::string get(const std::string &key) const
Get GBNF content string for a grammar key.
Concrete base class for inference backends (80% logic).
BackendInfo info() const
Get backend metadata.
GenerationResult generate(const std::vector< Message > &messages, const GenerationParams &params)
Generate a complete response.
GenerationResult generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Generate with per-token streaming callback.
GenerationResult complete(const std::string &prompt, const GenerationParams &params)
Raw text completion without chat template.
LlamaCppBackend — common llama.cpp patterns (15% layer).
llama_model * llama_model_ptr()
Get the loaded llama_model pointer.
SpeculativeCompatInfo check_speculative_compat() const
Check whether the currently-configured target/draft pair is compatible for speculative decoding.
std::vector< std::string > available_models() const
All configured tier names.
size_t load_grammars_from(const std::filesystem::path &grammar_dir)
Load grammars from an explicit directory path.
GenerationResult generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > &cancel, const std::string &tier_name="")
Streaming generation.
std::vector< std::string > loaded_models() const
Currently loaded model tier names.
bool initialize(const ParsedConfig &config)
Initialize from parsed config.
bool has_vision_capable_tier() const
Return true if any configured tier declares the "vision" capability (gh#41, v2.1.8).
size_t tier_footprint_bytes(const std::string &tier_name) const
Estimated VRAM footprint for a given tier in bytes.
void shutdown()
Shutdown — unload all models.
RoutingResult last_routing_result() const
Last routing result.
std::function< void(ResidencyEvent event, const std::string &tier_name, const std::string &model_path, size_t footprint)> ResidencyObserverFn
Residency observer callback type (internal C++ form).
GenerationResult generate(const std::vector< Message > &messages, const GenerationParams &params, const std::string &tier_name="")
Generate using routed or explicit tier.
void clear_all_prompt_caches()
Invalidate prompt/KV caches across every pooled backend.
std::string route(const std::vector< Message > &messages)
Route to tier using router model.
ChatAdapter * get_adapter(const std::string &tier_name) const
Get adapter for a tier.
void set_residency_observer(ResidencyObserverFn cb)
Register a residency observer.
std::string last_used_tier() const
Last used tier name.
~ModelOrchestrator()
Destructor — invokes shutdown() and AdapterManager::unload_all().
std::string select_vision_tier() const
Pick the canonical vision-capable tier name (gh#41).
bool can_handoff(const std::string &from, const std::string &to) const
Check if handoff is permitted.
std::string residency_snapshot_json() const
Serialize the current residency set as a JSON string.
InferenceBackend * get_backend(const std::string &tier_name) const
Get the inference backend for a tier (for evaluation APIs).
void clear_all_prompt_caches()
Fanout: clear prompt/KV cache on every loaded backend.
bool is_loaded(const std::string &role) const
Check whether a role is currently loaded and active.
void shutdown()
Unload every role.
InferenceBackend * get(const std::string &role) const
Get the backend for a role.
bool ensure_loaded(const std::string &role, const ModelConfig &config)
Lazily load and activate a model for a role.
@ ENTROPIC_ERROR_TIER_MODEL_TOO_LARGE
A single tier's model weights+KV exceed the engine's VRAM budget; eviction cannot help (v2....
@ ENTROPIC_ERROR_NOT_SUPPORTED
Capability not supported by this backend (v1.9.13)
@ ENTROPIC_ERROR_GENERATE_FAILED
Generation failed (context overflow, model error)
Pure C interface contract for inference backends.
void entropic_inference_log_to_file(const char *path)
Redirect llama/ggml logs to a file.
LlamaCppBackend — llama.cpp C API integration.
spdlog initialization and logger access.
auto now()
Get current time for timing measurements.
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
double elapsed_ms(std::chrono::steady_clock::time_point start, std::chrono::steady_clock::time_point end)
Compute elapsed milliseconds between two time points.
Activate model on GPU (WARM → ACTIVE).
static nlohmann::json make_residency_entry(const std::string &name, const std::filesystem::path &path, int context_length, size_t footprint, int vram_reserve_mb, long long last_ms)
JSON serialization of the current residency set.
@ ok
Tool dispatched, returned non-empty content.
static void log_orchestration(const GenerationResult &result, const std::string &selected, const std::string &adapter_name, const GenerationParams &params, double routing_ms, double swap_ms)
Log the per-orchestration tier/adapter/timing summary.
static llama_model * resolve_target_model(const std::shared_ptr< InferenceBackend > &tier_backend)
Resolve the active main-tier llama_model* for compat lookup.
std::unique_ptr< ChatAdapter > create_adapter(const std::string &name, const std::string &tier_name, const std::string &identity_prompt)
Create adapter by name.
static std::string normalize_grammar_key(const std::string &grammar_value)
Normalize a frontmatter grammar value to a registry key.
static void apply_adapter_parse(ChatAdapter *adapter, GenerationResult &result)
Run the tier's adapter over a result to split tool calls.
ModelOrchestrator — multi-model lifecycle and routing.
Tokenizer/architecture compatibility check for speculative decoding draft pairing.
Generation parameters for a single inference call.
std::string grammar
GBNF grammar string (empty = unconstrained)
float temperature
Sampling temperature.
std::string grammar_key
Grammar registry key.
int max_tokens
Maximum tokens to generate.
Result of a single generation call.
entropic_error_t error_code
Error code (ENTROPIC_OK if no error)
double swap_ms
Model swap time.
double routing_ms
Router classification time.
double generation_time_ms
Wall-clock generation time.
std::string raw_content
Raw model output before adapter processing.
std::string finish_reason
Finish reason: "stop", "length", "error".
std::string content
Generated text (cleaned by adapter)
std::vector< ToolCall > tool_calls
Tool calls parsed from content.
std::string error_message
Error description (empty if no error)
double total_ms
Total end-to-end time.
SpeculativeConfig speculative
Speculative decoding (gh#36)
std::filesystem::path path
Resolved model file path.
Result of a speculative-decoding compatibility check.
std::optional< ModelConfig > router
Router model (separate from tiers)
std::unordered_map< std::string, TierConfig > tiers
Tier name → config.
std::string default_tier
Default tier name.
Full parsed configuration.
int vram_reserve_mb
Reserved VRAM headroom (MB, 0–65536)
RoutingConfig routing
Routing rules.
InferenceConfig inference
Inference-side knobs (currently speculative decoding only).
ModelsConfig models
Tiers + router.
std::filesystem::path log_dir
Session log directory (session.log + session_model.log).
bool ggml_logging
Enable ggml/llama.cpp logging to llama_ggml.log in log_dir.
std::filesystem::path config_dir
Config dir — base for bundled data discovery.
std::string fallback_tier
Fallback when routing fails.
bool enabled
Enable routing.
Result metadata from a routing decision.
std::string adapter_name
Active adapter (empty = base model) (v1.9.2)
std::string swap_action
"none", "reused", "loaded"
double adapter_swap_ms
Adapter swap latency (v1.9.2)
bool enabled
Master switch (off by default)
int n_draft
Window size (proposed tokens)
ModelConfig draft
Full ModelConfig for the draft model.