Entropic 2.3.8
Local-first agentic inference engine
Loading...
Searching...
No Matches
orchestrator.cpp
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
17
18#include "llama_cpp_backend.h"
20
21#include <llama.h>
22#include <nlohmann/json.hpp>
23
24#include <cstdlib>
25#include <filesystem>
26
27namespace entropic {
28
29namespace {
30auto logger = entropic::log::get("inference.orchestrator");
33
41std::string extract_latest_user_message(const std::vector<Message>& messages) {
42 for (auto it = messages.rbegin(); it != messages.rend(); ++it) {
43 if (it->role == "user") {
44 return it->content;
45 }
46 }
47 return "";
48}
49
50} // anonymous namespace
51
52// ── Initialization ─────────────────────────────────────────
53
67bool ModelOrchestrator::create_tier_backends(const ParsedConfig& config) {
68 for (const auto& [name, tier_config] : config.models.tiers) {
69 std::string path_key = tier_config.path.string();
70 if (!std::filesystem::exists(tier_config.path)) {
71 logger->error("Model file not found for tier '{}': {}",
72 name, path_key);
73 logger->error("Place a GGUF file at the path above, or set "
74 "ENTROPIC_MODEL_DIR to a directory containing "
75 "it. Run `entropic download --list` to see "
76 "bundled model keys, then "
77 "`entropic download <key>` to fetch one.");
78 return false;
79 }
80 if (model_pool_.find(path_key) == model_pool_.end()) {
81 model_pool_[path_key] = std::make_shared<LlamaCppBackend>();
82 }
83 tiers_[name] = model_pool_[path_key];
84 adapters_[name] = create_adapter(
85 tier_config.adapter, name, "" /* prompt resolved later */);
86 }
87 // Router backend instantiation moved to SecondaryModelLoader
88 // (gh#27, v2.1.11). The loader allocates the role slot lazily on
89 // first ensure_loaded() call from activate_router().
90 logger->info("Created {} unique backend(s) for {} tier(s)",
91 model_pool_.size(), tiers_.size());
92 return true;
93}
94
101void ModelOrchestrator::build_routing_tables(const ParsedConfig& config) {
102 for (const auto& [digit, tier_name] : config.routing.tier_map) {
103 tier_map_[digit] = tier_name;
104 }
105 for (const auto& [src, targets] : config.routing.handoff_rules) {
106 handoff_rules_[src] = std::unordered_set<std::string>(
107 targets.begin(), targets.end());
108 }
109}
110
118bool ModelOrchestrator::activate_default_tier(const ParsedConfig& config) {
119 if (tiers_.find(default_tier_) == tiers_.end()) { return true; }
120 auto& backend = tiers_[default_tier_];
121 auto& tier_cfg = config.models.tiers.at(default_tier_);
122 if (!backend->load_and_activate(tier_cfg)) {
123 logger->error("Failed to activate default tier: {}", default_tier_);
124 return false;
125 }
126 loaded_main_tier_ = default_tier_;
127 logger->info("Activated default tier: {}", default_tier_);
128 return true;
129}
130
142void ModelOrchestrator::activate_router(const ParsedConfig& config) {
143 if (!config.models.router) { return; }
144 // Lifecycle now lives on SecondaryModelLoader (gh#27, v2.1.11).
145 // Diagnostic-level logging is emitted by the loader itself.
146 secondary_loader_.ensure_loaded("router", *config.models.router);
147}
148
162void ModelOrchestrator::activate_draft(const ParsedConfig& config) {
163 const auto& spec = config.inference.speculative;
164 if (!spec.enabled || spec.draft.path.empty()) { return; }
165 // Full ModelConfig comes from the YAML's
166 // `inference.speculative.draft:` block — every llama.cpp knob is
167 // consumer-tunable. Defaults come from
168 // `make_default_draft_model_config()` (gpu_layers=0,
169 // flash_attn=false, context_length=8192, n_threads=4).
170 secondary_loader_.ensure_loaded("draft", spec.draft);
171}
172
188 config_ = config;
189 default_tier_ = config.models.default_tier;
190 vram_budget_bytes_ = resolve_vram_budget_bytes();
191 if (vram_budget_bytes_ > 0) {
192 logger->info("[residency] VRAM budget: {} bytes "
193 "(ENTROPIC_VRAM_BUDGET_BYTES)",
194 vram_budget_bytes_);
195 }
196
197 // Route ggml/llama logs before any model loading
198 if (config.ggml_logging && !config.log_dir.empty()) {
199 auto path = (config.log_dir / "llama_ggml.log").string();
200 entropic_inference_log_to_file(path.c_str());
201 logger->info("ggml logging: {}", path);
202 }
203
204 logger->info("Initializing model orchestrator");
205
206 if (!create_tier_backends(config)) { return false; }
207 build_routing_tables(config);
208 if (!activate_default_tier(config)) { return false; }
209 activate_router(config);
210 activate_draft(config); // Speculative draft slot (v2.1.11)
211
212 preload_adapters(); // LoRA adapters → WARM (v1.9.2)
213 load_bundled_grammars(); // Bundled grammars (v1.9.3)
214 return true;
215}
216
227 logger->info("Shutting down model orchestrator");
228
229 for (auto& [path, backend] : model_pool_) {
230 if (backend->is_loaded()) {
231 backend->unload();
232 }
233 }
234
235 secondary_loader_.shutdown();
236}
237
244 // Order matters (gh#58 close-out, v2.3.0):
245 // 1. Backends first → frees llama_contexts.
246 // 2. LoRA adapter handles after → safe because the contexts
247 // that may have held HOT adapter references are gone.
248 shutdown();
249 lora_manager_.unload_all();
250}
251
258GenerationResult ModelOrchestrator::run_generate_dispatch(
259 InferenceBackend* model,
260 const std::vector<Message>& messages,
261 const GenerationParams& params) {
262 GenerationResult result;
263 bool kernel_ran = config_.inference.speculative.enabled
264 && try_speculative_route(model, messages, params, result);
265 if (!kernel_ran) {
266 result = model->generate(messages, params);
267 }
268 return result;
269}
270
281bool ModelOrchestrator::try_speculative_route_streaming(
282 InferenceBackend* model,
283 const std::vector<Message>& messages,
284 const GenerationParams& params,
285 std::function<void(std::string_view)> on_token,
286 std::atomic<bool>& cancel,
287 GenerationResult& result)
288{
289 auto compat = check_speculative_compat();
290 bool kernel_ran = false;
291 if (!compat.compatible) {
292 logger->info("Speculative requested but pair incompatible "
293 "({}); using plain decode", compat.diagnostic);
294 } else {
295 auto* llama_target = dynamic_cast<LlamaCppBackend*>(model);
296 auto* draft_be = secondary_loader_.get("draft");
297 auto* llama_draft = dynamic_cast<LlamaCppBackend*>(draft_be);
298 if (llama_target == nullptr || llama_draft == nullptr) {
299 logger->info("Speculative compat passed but target/draft "
300 "is not llama.cpp; using plain decode");
301 } else {
302 auto spec = llama_target->generate_speculative_with_draft(
303 messages, params, on_token, cancel, *llama_draft,
305 config_.inference.speculative.draft.path.string());
306 if (spec.error_code == ENTROPIC_ERROR_NOT_SUPPORTED) {
307 logger->info("Speculative kernel returned NOT_SUPPORTED "
308 "({}); falling back", spec.error_message);
309 } else {
310 result = std::move(spec);
311 kernel_ran = true;
312 }
313 }
314 }
315 return kernel_ran;
316}
317
324bool ModelOrchestrator::try_speculative_route(
325 InferenceBackend* model,
326 const std::vector<Message>& messages,
327 const GenerationParams& params,
328 GenerationResult& result)
329{
330 std::atomic<bool> local_cancel{false};
331 return try_speculative_route_streaming(
332 model, messages, params,
333 [](std::string_view){}, local_cancel, result);
334}
335
336// ── Generation ─────────────────────────────────────────────
337
345static void apply_adapter_parse(ChatAdapter* adapter,
346 GenerationResult& result) {
347 if (!adapter || result.content.empty()) { return; }
348 result.raw_content = result.content;
349 auto parsed = adapter->parse_tool_calls(result.content);
350 result.content = parsed.cleaned_content;
351 result.tool_calls = std::move(parsed.tool_calls);
352}
353
365static void log_orchestration(const GenerationResult& result,
366 const std::string& selected,
367 const std::string& adapter_name,
368 const GenerationParams& params,
369 double routing_ms, double swap_ms) {
370 logger->info("Orchestration: tier={}, adapter={}, grammar={}",
371 selected, adapter_name,
372 params.grammar.empty() ? "unconstrained"
373 : params.grammar_key);
374 logger->info("Total: {:.0f}ms (route={:.0f}ms, swap={:.0f}ms, "
375 "gen={:.0f}ms)",
376 result.total_ms, routing_ms, swap_ms,
377 result.generation_time_ms);
378}
379
400 const std::vector<Message>& messages,
401 const GenerationParams& params,
402 const std::string& tier_name)
403{
404 auto t_start = now();
405
406 // Route if no explicit tier
407 std::string selected = tier_name;
408 double routing_ms = 0.0;
409 if (selected.empty()) {
410 auto t_route = now();
411 selected = route(messages);
412 routing_ms = elapsed_ms(t_route, now());
413 }
414
415 // Get model (may trigger swap)
416 auto t_swap = now();
417 InferenceBackend* model = get_model(selected);
418 double swap_ms = elapsed_ms(t_swap, now());
419
420 if (!model) { return build_no_model_error(selected); }
421
422 // Resolve grammar_key → grammar content (v1.9.3)
423 GenerationParams resolved_params = params;
424 resolve_grammar_key(resolved_params, selected);
425
426 // Generate — speculative routing applies here too (v2.1.11, gh#36)
427 GenerationResult result = run_generate_dispatch(
428 model, messages, resolved_params);
429
430 apply_adapter_parse(get_adapter(selected), result);
431
432 result.routing_ms = routing_ms;
433 result.swap_ms = swap_ms;
434 result.total_ms = elapsed_ms(t_start, now());
435 log_orchestration(result, selected, last_routing_result_.adapter_name,
436 resolved_params, routing_ms, swap_ms);
437 return result;
438}
439
454 const std::vector<Message>& messages,
455 const GenerationParams& params,
456 std::function<void(std::string_view)> on_token,
457 std::atomic<bool>& cancel,
458 const std::string& tier_name)
459{
460 std::string selected = tier_name.empty() ? route(messages) : tier_name;
461 InferenceBackend* model = get_model(selected);
462
463 if (!model) {
466 err.error_message = "No model for tier: " + selected;
467 err.finish_reason = "error";
468 return err;
469 }
470
471 // Resolve grammar_key → grammar content (v1.9.3)
472 GenerationParams resolved_params = params;
473 resolve_grammar_key(resolved_params, selected);
474
475 // Speculative routing (v2.1.11, gh#36): when speculative is
476 // enabled in config AND target/draft pair is compatible, attempt
477 // the speculative kernel. On NOT_SUPPORTED (kernel staged), fall
478 // back to plain streaming. This keeps the v2.1.11 ship-without-
479 // kernel state observable as "plain decode, speculative
480 // requested but deferred."
481 GenerationResult spec_streaming;
482 if (config_.inference.speculative.enabled
483 && try_speculative_route_streaming(
484 model, messages, resolved_params, on_token, cancel,
485 spec_streaming)) {
486 return spec_streaming;
487 }
488
489 return model->generate_streaming(messages, resolved_params, on_token, cancel);
490}
491
492// ── Routing ────────────────────────────────────────────────
493
506std::string ModelOrchestrator::route(const std::vector<Message>& messages) {
507 if (!config_.routing.enabled
508 || !config_.models.router.has_value()) {
509 logger->info("Route: routing disabled, using default '{}'",
510 default_tier_);
511 last_routing_result_ = {default_tier_, "", "", "none", 0.0};
512 return default_tier_;
513 }
514
515 auto [tier, raw] = classify_task(messages);
516 last_routing_result_ = {tier, loaded_main_tier_, raw, "none", 0.0};
517
518 // Track history
519 tier_history_.push_back(tier);
520 if (tier_history_.size() > 5) {
521 tier_history_.erase(tier_history_.begin());
522 }
523
524 logger->info("[ROUTER] {} | raw='{}'", tier, raw);
525 return tier;
526}
527
541std::pair<std::string, std::string> ModelOrchestrator::classify_task(
542 const std::vector<Message>& messages)
543{
544 std::string user_msg = extract_latest_user_message(messages);
545
546 GenerationParams router_params;
547 router_params.max_tokens = 1;
548 router_params.temperature = 0.0f;
549
550 auto* router_backend = secondary_loader_.get("router");
551 if (router_backend == nullptr) {
552 logger->warn("classify_task: router not loaded; returning empty");
553 return {"", ""};
554 }
555 auto result = router_backend->complete(
556 user_msg + " ->", router_params);
557 std::string raw = result.content;
558
559 // Trim whitespace
560 auto start = raw.find_first_not_of(" \t\n\r");
561 if (start != std::string::npos) {
562 raw = raw.substr(start);
563 }
564
565 // Find matching tier
566 for (char c : raw) {
567 std::string digit(1, c);
568 auto it = tier_map_.find(digit);
569 if (it != tier_map_.end()) {
570 logger->info("Route: digit='{}' -> tier='{}'",
571 digit, it->second);
572 return {it->second, digit};
573 }
574 }
575
576 logger->warn("Route: no valid digit in '{}', defaulting to {}",
577 raw, default_tier_);
578 return {default_tier_, ""};
579}
580
581// ── Model access ───────────────────────────────────────────
582
601void ModelOrchestrator::record_activation_reuse(
602 const std::string& tier_name) {
603 auto now_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
604 std::chrono::steady_clock::now() - start_time_).count();
605 bool tier_changed = (loaded_main_tier_ != tier_name);
606 tier_last_activation_ms_[tier_name] = now_ms;
607 if (!tier_changed) { return; }
608 auto tier_it = config_.models.tiers.find(tier_name);
609 std::string path = tier_it != config_.models.tiers.end()
610 ? tier_it->second.path.string() : "";
611 size_t footprint = tier_footprint_bytes_.count(tier_name)
612 ? tier_footprint_bytes_[tier_name]
613 : estimate_footprint_bytes(tier_name);
614 tier_footprint_bytes_[tier_name] = footprint;
615 loaded_main_tier_ = tier_name;
616 fire_residency_observer(ResidencyEvent::ActivationSwap,
617 tier_name, path, footprint);
618}
619
630bool ModelOrchestrator::residency_admits(const std::string& tier_name) {
631 size_t footprint = estimate_footprint_bytes(tier_name);
632 if (footprint > 0) {
633 tier_footprint_bytes_[tier_name] = footprint;
634 }
635 if (vram_budget_bytes_ > 0 && footprint > vram_budget_bytes_) {
636 logger->error("[residency] tier '{}' footprint {} bytes "
637 "exceeds VRAM budget {} bytes — "
638 "TIER_MODEL_TOO_LARGE (gh#57)",
639 tier_name, footprint, vram_budget_bytes_);
640 last_residency_error_ = ENTROPIC_ERROR_TIER_MODEL_TOO_LARGE;
641 return false;
642 }
643 return true;
644}
645
665GenerationResult ModelOrchestrator::build_no_model_error(
666 const std::string& tier_name) {
667 GenerationResult err;
668 err.finish_reason = "error";
669 if (last_residency_error_ != ENTROPIC_OK) {
670 err.error_code = last_residency_error_;
671 err.error_message = "Tier '" + tier_name + "' model exceeds the "
672 "engine's VRAM budget (gh#57)";
673 last_residency_error_ = ENTROPIC_OK;
674 } else {
675 err.error_code = ENTROPIC_ERROR_GENERATE_FAILED;
676 err.error_message = "No model available for tier: " + tier_name;
677 }
678 return err;
679}
680
694InferenceBackend* ModelOrchestrator::activate_and_track(
695 const std::string& tier_name,
696 const std::shared_ptr<InferenceBackend>& backend) {
697 auto tier_it = config_.models.tiers.find(tier_name);
698 bool activated = tier_it != config_.models.tiers.end()
699 && backend->load_and_activate(tier_it->second);
700 if (!activated) {
701 logger->error("Failed to activate tier: {}", tier_name);
702 return nullptr;
703 }
704 loaded_main_tier_ = tier_name;
705 last_routing_result_.swap_action = "loaded";
706 auto now_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
707 std::chrono::steady_clock::now() - start_time_).count();
708 tier_last_activation_ms_[tier_name] = now_ms;
709 size_t footprint = tier_footprint_bytes_.count(tier_name)
710 ? tier_footprint_bytes_[tier_name] : 0;
711 fire_residency_observer(ResidencyEvent::Loaded,
712 tier_name, tier_it->second.path.string(),
713 footprint);
714 return backend.get();
715}
716
730InferenceBackend* ModelOrchestrator::get_model(const std::string& tier_name) {
731 std::lock_guard<std::mutex> lock(swap_mutex_);
732
733 auto it = tiers_.find(tier_name);
734 std::string effective_tier = tier_name;
735 if (it == tiers_.end()) {
736 it = tiers_.find(config_.routing.fallback_tier);
737 if (it != tiers_.end()) {
738 effective_tier = config_.routing.fallback_tier;
739 }
740 }
741
742 InferenceBackend* result = nullptr;
743 if (it != tiers_.end() && it->second->is_active()) {
744 last_routing_result_.swap_action = "reused";
745 record_activation_reuse(effective_tier);
746 result = it->second.get();
747 } else if (it != tiers_.end() && residency_admits(effective_tier)) {
748 deactivate_current_if_needed(it->second.get());
749 result = activate_and_track(effective_tier, it->second);
750 }
751
752 // Ensure correct LoRA adapter for this tier (v1.9.2)
753 if (result) {
754 ensure_tier_lora(tier_name, result);
755 }
756
757 return result;
758}
759
767void ModelOrchestrator::ensure_tier_lora(const std::string& tier_name,
768 InferenceBackend* result) {
769 auto* llama_backend = dynamic_cast<LlamaCppBackend*>(result);
770 llama_context* ctx = llama_backend
771 ? llama_backend->llama_context_ptr() : nullptr;
772 double adapter_ms = ensure_adapter_for_tier(tier_name, ctx);
773 last_routing_result_.adapter_swap_ms = adapter_ms;
774 last_routing_result_.adapter_name = lora_manager_.active_adapter();
775}
776
788void ModelOrchestrator::deactivate_current_if_needed(InferenceBackend* incoming) {
789 auto it = loaded_main_tier_.empty()
790 ? tiers_.end() : tiers_.find(loaded_main_tier_);
791
792 bool should_swap = it != tiers_.end()
793 && it->second.get() != incoming
794 && it->second->is_loaded();
795
796 if (!should_swap) {
797 return;
798 }
799
800 // Cascade: unload adapters for this base model (v1.9.2)
801 auto* llama_backend = dynamic_cast<LlamaCppBackend*>(it->second.get());
802 if (llama_backend) {
803 lora_manager_.unload_all_for_model(
804 llama_backend->llama_model_ptr(),
805 llama_backend->llama_context_ptr());
806 }
807
808 unload_or_warm_current(it->second.get());
809}
810
817void ModelOrchestrator::unload_or_warm_current(InferenceBackend* current) {
818 auto cfg_it = config_.models.tiers.find(loaded_main_tier_);
819 bool keep_warm = cfg_it != config_.models.tiers.end()
820 && cfg_it->second.keep_warm;
821
822 if (keep_warm) {
823 logger->info("Deactivating {} (keep_warm=true)", loaded_main_tier_);
824 current->deactivate();
825 return;
826 }
827 logger->info("Unloading {} (keep_warm=false)", loaded_main_tier_);
828 std::string path = cfg_it != config_.models.tiers.end()
829 ? cfg_it->second.path.string() : "";
830 size_t footprint = tier_footprint_bytes_.count(loaded_main_tier_)
831 ? tier_footprint_bytes_[loaded_main_tier_] : 0;
832 std::string evicted_tier = loaded_main_tier_;
833 current->unload();
834 fire_residency_observer(ResidencyEvent::Evicted,
835 evicted_tier, path, footprint);
836}
837
838// ── Queries ────────────────────────────────────────────────
839
846 return last_routing_result_;
847}
848
855 return loaded_main_tier_;
856}
857
867std::vector<std::string> ModelOrchestrator::loaded_models() const {
868 std::vector<std::string> result;
869 for (const auto& [name, backend] : tiers_) {
870 if (backend->is_loaded()) {
871 result.push_back(name);
872 }
873 }
874 if (secondary_loader_.is_loaded("router")) {
875 result.push_back("router");
876 }
877 return result;
878}
879
885std::vector<std::string> ModelOrchestrator::available_models() const {
886 std::vector<std::string> result;
887 for (const auto& [name, _] : tiers_) {
888 result.push_back(name);
889 }
890 if (config_.models.router.has_value()) {
891 result.push_back("router");
892 }
893 return result;
894}
895
904 const std::string& tier_name) const {
905 auto it = tiers_.find(tier_name);
906 if (it == tiers_.end()) { return nullptr; }
907 return it->second.get();
908}
909
916 const std::string& from, const std::string& to) const
917{
918 auto it = handoff_rules_.find(from);
919 if (it == handoff_rules_.end()) {
920 return false;
921 }
922 return it->second.count(to) > 0;
923}
924
930ChatAdapter* ModelOrchestrator::get_adapter(const std::string& tier_name) const {
931 auto it = adapters_.find(tier_name);
932 if (it != adapters_.end()) {
933 return it->second.get();
934 }
935 return nullptr;
936}
937
938// ── LoRA adapter management (v1.9.2) ──────────────────────
939
960bool ModelOrchestrator::deactivate_if_active(llama_context* ctx) {
961 if (lora_manager_.active_adapter().empty()) {
962 return false;
963 }
964 lora_manager_.deactivate(ctx);
965 return true;
966}
967
976double ModelOrchestrator::ensure_adapter_for_tier(
977 const std::string& tier_name, llama_context* ctx)
978{
979 auto tier_it = config_.models.tiers.find(tier_name);
980 if (tier_it == config_.models.tiers.end()) {
981 return 0.0;
982 }
983
984 const auto& tier_cfg = tier_it->second;
985 auto t_start = now();
986 bool needs_kv_clear = false;
987
988 if (!tier_cfg.adapter_path) {
989 needs_kv_clear = deactivate_if_active(ctx);
990 } else if (lora_manager_.active_adapter() != tier_name) {
991 needs_kv_clear = lora_manager_.swap(tier_name, ctx);
992 if (!needs_kv_clear) {
993 logger->warn("Adapter swap to '{}' failed", tier_name);
994 }
995 }
996
997 if (needs_kv_clear && ctx) {
998 llama_memory_clear(llama_get_memory(ctx), true);
999 logger->info("Adapter swap for tier '{}' in {:.1f}ms",
1000 tier_name, elapsed_ms(t_start, now()));
1001 }
1002
1003 return elapsed_ms(t_start, now());
1004}
1005
1015void ModelOrchestrator::preload_adapters() {
1016 int loaded = 0;
1017
1018 for (const auto& [name, tier_cfg] : config_.models.tiers) {
1019 if (!tier_cfg.adapter_path) {
1020 continue;
1021 }
1022
1023 auto tier_it = tiers_.find(name);
1024 if (tier_it == tiers_.end()) {
1025 continue;
1026 }
1027
1028 auto* llama_backend = dynamic_cast<LlamaCppBackend*>(
1029 tier_it->second.get());
1030 if (!llama_backend || !llama_backend->llama_model_ptr()) {
1031 logger->warn("Cannot preload adapter for '{}' — model not loaded",
1032 name);
1033 continue;
1034 }
1035
1036 bool ok = lora_manager_.load(
1037 name,
1038 *tier_cfg.adapter_path,
1039 llama_backend->llama_model_ptr(),
1040 tier_cfg.adapter_scale);
1041
1042 if (ok) {
1043 ++loaded;
1044 }
1045 }
1046
1047 if (loaded > 0) {
1048 logger->info("Preloaded {} LoRA adapter(s) to WARM", loaded);
1049 }
1050}
1051
1052// ── Grammar registry (v1.9.3) ──────────────────────────────
1053
1063void ModelOrchestrator::load_bundled_grammars() {
1064 std::filesystem::path grammar_dir;
1065 if (!config_.config_dir.empty()) {
1066 grammar_dir = config_.config_dir / "grammars";
1067 }
1068 if (grammar_dir.empty() || !std::filesystem::is_directory(grammar_dir)) {
1069 // Fallback set by facade via load_grammars_from() if config_dir
1070 // doesn't have a grammars subdir. Check if already loaded.
1071 logger->info("No bundled grammar directory found, skipping");
1072 return;
1073 }
1074
1075 size_t count = grammar_registry_.load_bundled(grammar_dir);
1076 logger->info("Grammar registry: {} grammar(s) loaded from {}",
1077 count, grammar_dir.string());
1078}
1079
1093 const std::filesystem::path& grammar_dir) {
1094 if (!std::filesystem::is_directory(grammar_dir)) {
1095 return 0;
1096 }
1097 auto count = grammar_registry_.load_bundled(grammar_dir);
1098 logger->info("Grammar registry: {} grammar(s) loaded from {}",
1099 count, grammar_dir.string());
1100 return count;
1101}
1102
1114 for (auto& [_, backend] : model_pool_) {
1115 if (backend) { backend->clear_prompt_cache(); }
1116 }
1117 secondary_loader_.clear_all_prompt_caches();
1118 logger->info("Prompt caches invalidated across all backends "
1119 "(identity change)");
1120}
1121
1129 for (const auto& [_, tier] : config_.models.tiers) {
1130 if (tier.has_capability("vision")) { return true; }
1131 }
1132 return false;
1133}
1134
1142 for (const auto& [name, tier] : config_.models.tiers) {
1143 if (tier.has_capability("vision")) { return name; }
1144 }
1145 return "";
1146}
1147
1156static llama_model* resolve_target_model(
1157 const std::shared_ptr<InferenceBackend>& tier_backend) {
1158 if (!tier_backend || !tier_backend->is_loaded()) {
1159 return nullptr;
1160 }
1161 auto* llama_be = dynamic_cast<LlamaCppBackend*>(tier_backend.get());
1162 return (llama_be == nullptr) ? nullptr : llama_be->llama_model_ptr();
1163}
1164
1175std::string ModelOrchestrator::resolve_speculative_pair(
1176 llama_model*& target_out, llama_model*& draft_out) const {
1177 target_out = nullptr;
1178 draft_out = nullptr;
1179 std::string err;
1180
1181 auto tier_it = tiers_.find(loaded_main_tier_);
1182 if (tier_it == tiers_.end()) {
1183 err = "no main tier loaded";
1184 } else {
1185 target_out = resolve_target_model(tier_it->second);
1186 if (target_out == nullptr) {
1187 err = "main tier backend is not a llama.cpp backend or "
1188 "is not loaded";
1189 } else {
1190 auto* draft_backend = secondary_loader_.get("draft");
1191 if (draft_backend == nullptr || !draft_backend->is_loaded()) {
1192 err = "no draft model configured for speculative "
1193 "decoding "
1194 "(set inference.speculative.draft_model)";
1195 } else {
1196 auto* d = dynamic_cast<LlamaCppBackend*>(draft_backend);
1197 draft_out = (d == nullptr) ? nullptr : d->llama_model_ptr();
1198 if (draft_out == nullptr) {
1199 err = "draft backend is not a llama.cpp backend";
1200 }
1201 }
1202 }
1203 }
1204 return err;
1205}
1206
1218ModelOrchestrator::SpeculativeCompatInfo
1221 llama_model* target_model = nullptr;
1222 llama_model* draft_model = nullptr;
1223 info.diagnostic = resolve_speculative_pair(target_model, draft_model);
1224 if (info.diagnostic.empty()) {
1225 auto result = entropic::speculative::check_compat(
1226 target_model, draft_model);
1227 info.compatible = result.compatible;
1228 info.diagnostic = std::move(result.diagnostic);
1229 }
1230 return info;
1231}
1232
/**
 * @brief Turn a tier's grammar value into a registry key.
 *
 * A value whose extension is ".gbnf" is reduced to its file stem; any
 * other value is returned unchanged.
 *
 * @param grammar_value Grammar value from the tier configuration.
 * @return The key to look up in the grammar registry.
 */
static std::string normalize_grammar_key(const std::string& grammar_value) {
    const std::filesystem::path as_path{grammar_value};
    const bool is_gbnf_file = (as_path.extension() == ".gbnf");
    return is_gbnf_file ? as_path.stem().string() : grammar_value;
}
1251
1266void ModelOrchestrator::resolve_grammar_key(
1267 GenerationParams& params, const std::string& tier_name)
1268{
1269 if (!params.grammar.empty()) {
1270 return;
1271 }
1272
1273 // Try explicit grammar_key
1274 std::string key = params.grammar_key;
1275
1276 // Fall back to tier config grammar field (frontmatter)
1277 if (key.empty()) {
1278 auto it = config_.models.tiers.find(tier_name);
1279 if (it != config_.models.tiers.end() && it->second.grammar) {
1280 key = normalize_grammar_key(it->second.grammar->string());
1281 }
1282 }
1283
1284 if (key.empty()) {
1285 return;
1286 }
1287
1288 std::string content = grammar_registry_.get(key);
1289 if (content.empty()) {
1290 logger->warn("Grammar key '{}' not found in registry", key);
1291 return;
1292 }
1293
1294 logger->info("Grammar resolved: key='{}', {} bytes",
1295 key, content.size());
1296 params.grammar = std::move(content);
1297}
1298
1299// ── VRAM-aware tier residency (v2.2.4, gh#57) ──────────────
1300
1311size_t ModelOrchestrator::resolve_vram_budget_bytes() {
1312 const char* env = std::getenv("ENTROPIC_VRAM_BUDGET_BYTES");
1313 if (env == nullptr || *env == '\0') { return 0; }
1314 try {
1315 long long v = std::stoll(env);
1316 return (v < 0) ? 0 : static_cast<size_t>(v);
1317 } catch (...) {
1318 return 0;
1319 }
1320}
1321
1332size_t ModelOrchestrator::estimate_footprint_bytes(
1333 const std::string& tier_name) const {
1334 auto tier_it = config_.models.tiers.find(tier_name);
1335 if (tier_it == config_.models.tiers.end()) { return 0; }
1336 const auto& tier_cfg = tier_it->second;
1337 std::error_code ec;
1338 auto weights = std::filesystem::file_size(tier_cfg.path, ec);
1339 if (ec) { return 0; }
1340 const size_t kv_per_token = 16ull * 1024ull;
1341 size_t kv = static_cast<size_t>(tier_cfg.context_length) * kv_per_token;
1342 size_t headroom = static_cast<size_t>(config_.vram_reserve_mb)
1343 * 1024ull * 1024ull;
1344 return static_cast<size_t>(weights) + kv + headroom;
1345}
1346
1353 const std::string& tier_name) const {
1354 std::lock_guard<std::mutex> lock(swap_mutex_);
1355 auto it = tier_footprint_bytes_.find(tier_name);
1356 if (it != tier_footprint_bytes_.end()) { return it->second; }
1357 size_t v = estimate_footprint_bytes(tier_name);
1358 if (v > 0) {
1359 tier_footprint_bytes_[tier_name] = v;
1360 }
1361 return v;
1362}
1363
1370 std::lock_guard<std::mutex> lock(swap_mutex_);
1371 residency_observer_ = std::move(cb);
1372}
1373
1379void ModelOrchestrator::fire_residency_observer(
1380 ResidencyEvent event,
1381 const std::string& tier_name,
1382 const std::string& model_path,
1383 size_t footprint) {
1384 const char* event_name = "unknown";
1385 switch (event) {
1386 case ResidencyEvent::Loaded: event_name = "loaded"; break;
1387 case ResidencyEvent::Evicted: event_name = "evicted"; break;
1388 case ResidencyEvent::ActivationSwap: event_name = "activation_swap"; break;
1389 }
1390 logger->info("[residency] {} tier='{}' path='{}' footprint={} bytes",
1391 event_name, tier_name, model_path, footprint);
1392 if (residency_observer_) {
1393 residency_observer_(event, tier_name, model_path, footprint);
1394 }
1395}
1396
1414static nlohmann::json make_residency_entry(
1415 const std::string& name, const std::filesystem::path& path,
1416 int context_length, size_t footprint, int vram_reserve_mb,
1417 long long last_ms) {
1418 std::error_code ec;
1419 auto weights = std::filesystem::file_size(path, ec);
1420 size_t weights_b = ec ? 0u : static_cast<size_t>(weights);
1421 size_t kv = static_cast<size_t>(context_length) * 16ull * 1024ull;
1422 size_t headroom = static_cast<size_t>(vram_reserve_mb)
1423 * 1024ull * 1024ull;
1424 return {
1425 {"tier", name},
1426 {"model_path", path.string()},
1427 {"footprint_bytes", footprint},
1428 {"weights_bytes", weights_b},
1429 {"kv_cache_bytes", kv},
1430 {"headroom_bytes", headroom},
1431 {"last_activation_ms", last_ms}
1432 };
1433}
1434
1441 std::lock_guard<std::mutex> lock(swap_mutex_);
1442 nlohmann::json j;
1443 j["vram_total_bytes"] = vram_budget_bytes_;
1444 j["vram_budget_bytes"] = vram_budget_bytes_;
1445 size_t in_use = 0;
1446 nlohmann::json arr = nlohmann::json::array();
1447 for (const auto& [name, backend] : tiers_) {
1448 if (!backend || !backend->is_loaded()) { continue; }
1449 auto tier_it = config_.models.tiers.find(name);
1450 if (tier_it == config_.models.tiers.end()) { continue; }
1451 auto fp_it = tier_footprint_bytes_.find(name);
1452 size_t footprint = (fp_it != tier_footprint_bytes_.end())
1453 ? fp_it->second : estimate_footprint_bytes(name);
1454 in_use += footprint;
1455 auto la = tier_last_activation_ms_.find(name);
1456 long long last_ms = (la != tier_last_activation_ms_.end())
1457 ? la->second : 0;
1458 arr.push_back(make_residency_entry(
1459 name, tier_it->second.path, tier_it->second.context_length,
1460 footprint, config_.vram_reserve_mb, last_ms));
1461 }
1462 j["residency"] = std::move(arr);
1463 j["vram_headroom_bytes"] = vram_budget_bytes_ > in_use
1464 ? vram_budget_bytes_ - in_use
1465 : 0u;
1466 j["backend"] = vram_budget_bytes_ > 0 ? "configured" : "unknown";
1467 return j.dump();
1468}
1469
1470} // namespace entropic
Adapter factory — create adapters by name.
bool swap(const std::string &name, llama_context *ctx)
Swap to a different adapter atomically.
std::string active_adapter() const
Get the currently HOT adapter name.
void unload_all_for_model(llama_model *model, llama_context *ctx)
Unload all adapters for a given base model.
void deactivate(llama_context *ctx)
Deactivate current HOT adapter (HOT -> WARM).
bool load(const std::string &name, const std::filesystem::path &adapter_path, llama_model *model, float scale=1.0f)
Load a LoRA adapter into RAM (COLD -> WARM).
void unload_all()
Free every loaded adapter handle (gh#58 close-out, v2.3.0).
Concrete base class for chat format adapters (80% logic).
virtual ParseResult parse_tool_calls(const std::string &content) const =0
Parse tool calls from model output.
size_t load_bundled(const std::filesystem::path &grammar_dir)
Load all bundled grammars from a directory.
std::string get(const std::string &key) const
Get GBNF content string for a grammar key.
Concrete base class for inference backends (80% logic).
Definition backend.h:69
BackendInfo info() const
Get backend metadata.
Definition backend.cpp:486
GenerationResult generate(const std::vector< Message > &messages, const GenerationParams &params)
Generate a complete response.
Definition backend.cpp:182
GenerationResult generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view token)> on_token, std::atomic< bool > &cancel)
Generate with per-token streaming callback.
Definition backend.cpp:211
GenerationResult complete(const std::string &prompt, const GenerationParams &params)
Raw text completion without chat template.
Definition backend.cpp:308
LlamaCppBackend — common llama.cpp patterns (15% layer).
llama_model * llama_model_ptr()
Get the loaded llama_model pointer.
SpeculativeCompatInfo check_speculative_compat() const
Check whether the currently-configured target/draft pair is compatible for speculative decoding.
std::vector< std::string > available_models() const
All configured tier names.
size_t load_grammars_from(const std::filesystem::path &grammar_dir)
Load grammars from an explicit directory path.
GenerationResult generate_streaming(const std::vector< Message > &messages, const GenerationParams &params, std::function< void(std::string_view)> on_token, std::atomic< bool > &cancel, const std::string &tier_name="")
Streaming generation.
std::vector< std::string > loaded_models() const
Currently loaded model tier names.
bool initialize(const ParsedConfig &config)
Initialize from parsed config.
bool has_vision_capable_tier() const
Return true if any configured tier declares the "vision" capability (gh#41, v2.1.8).
size_t tier_footprint_bytes(const std::string &tier_name) const
Estimated VRAM footprint for a given tier in bytes.
void shutdown()
Shutdown — unload all models.
RoutingResult last_routing_result() const
Last routing result.
std::function< void(ResidencyEvent event, const std::string &tier_name, const std::string &model_path, size_t footprint)> ResidencyObserverFn
Residency observer callback type (internal C++ form).
GenerationResult generate(const std::vector< Message > &messages, const GenerationParams &params, const std::string &tier_name="")
Generate using routed or explicit tier.
void clear_all_prompt_caches()
Invalidate prompt/KV caches across every pooled backend.
std::string route(const std::vector< Message > &messages)
Route to tier using router model.
ChatAdapter * get_adapter(const std::string &tier_name) const
Get adapter for a tier.
void set_residency_observer(ResidencyObserverFn cb)
Register a residency observer.
std::string last_used_tier() const
Last used tier name.
~ModelOrchestrator()
Destructor — invokes shutdown() and AdapterManager::unload_all().
std::string select_vision_tier() const
Pick the canonical vision-capable tier name (gh#41).
bool can_handoff(const std::string &from, const std::string &to) const
Check if handoff is permitted.
std::string residency_snapshot_json() const
Serialize the current residency set as a JSON string.
InferenceBackend * get_backend(const std::string &tier_name) const
Get the inference backend for a tier (for evaluation APIs).
void clear_all_prompt_caches()
Fanout: clear prompt/KV cache on every loaded backend.
bool is_loaded(const std::string &role) const
Check whether a role is currently loaded and active.
InferenceBackend * get(const std::string &role) const
Get the backend for a role.
bool ensure_loaded(const std::string &role, const ModelConfig &config)
Lazily load and activate a model for a role.
@ ENTROPIC_OK
Success.
Definition error.h:36
@ ENTROPIC_ERROR_TIER_MODEL_TOO_LARGE
A single tier's model weights+KV exceed the engine's VRAM budget; eviction cannot help (v2....
Definition error.h:89
@ ENTROPIC_ERROR_NOT_SUPPORTED
Capability not supported by this backend (v1.9.13)
Definition error.h:84
@ ENTROPIC_ERROR_GENERATE_FAILED
Generation failed (context overflow, model error)
Definition error.h:42
Pure C interface contract for inference backends.
void entropic_inference_log_to_file(const char *path)
Redirect llama/ggml logs to a file.
LlamaCppBackend — llama.cpp C API integration.
spdlog initialization and logger access.
auto now()
Get current time for timing measurements.
Definition logging.h:193
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
Definition logging.cpp:211
double elapsed_ms(std::chrono::steady_clock::time_point start, std::chrono::steady_clock::time_point end)
Compute elapsed milliseconds between two time points.
Definition logging.h:203
Activate model on GPU (WARM → ACTIVE).
static nlohmann::json make_residency_entry(const std::string &name, const std::filesystem::path &path, int context_length, size_t footprint, int vram_reserve_mb, long long last_ms)
JSON serialization of the current residency set.
@ ok
Tool dispatched, returned non-empty content.
static void log_orchestration(const GenerationResult &result, const std::string &selected, const std::string &adapter_name, const GenerationParams &params, double routing_ms, double swap_ms)
Log the per-orchestration tier/adapter/timing summary.
static llama_model * resolve_target_model(const std::shared_ptr< InferenceBackend > &tier_backend)
Resolve the active main-tier llama_model* for compat lookup.
std::unique_ptr< ChatAdapter > create_adapter(const std::string &name, const std::string &tier_name, const std::string &identity_prompt)
Create adapter by name.
static std::string normalize_grammar_key(const std::string &grammar_value)
Normalize a frontmatter grammar value to a registry key.
static void apply_adapter_parse(ChatAdapter *adapter, GenerationResult &result)
Run the tier's adapter over a result to split tool calls.
ModelOrchestrator — multi-model lifecycle and routing.
Tokenizer/architecture compatibility check for speculative decoding draft pairing.
Generation parameters for a single inference call.
Definition config.h:227
std::string grammar
GBNF grammar string (empty = unconstrained)
Definition config.h:240
float temperature
Sampling temperature.
Definition config.h:228
std::string grammar_key
Grammar registry key.
Definition config.h:245
int max_tokens
Maximum tokens to generate.
Definition config.h:232
Result of a single generation call.
entropic_error_t error_code
Error code (ENTROPIC_OK if no error)
double swap_ms
Model swap time.
double routing_ms
Router classification time.
double generation_time_ms
Wall-clock generation time.
std::string raw_content
Raw model output before adapter processing.
std::string finish_reason
Finish reason: "stop", "length", "error".
std::string content
Generated text (cleaned by adapter)
std::vector< ToolCall > tool_calls
Tool calls parsed from content.
std::string error_message
Error description (empty if no error)
double total_ms
Total end-to-end time.
SpeculativeConfig speculative
Speculative decoding (gh#36)
Definition config.h:703
std::filesystem::path path
Resolved model file path.
Definition config.h:149
Result of a speculative-decoding compatibility check.
std::optional< ModelConfig > router
Router model (separate from tiers)
Definition config.h:358
std::unordered_map< std::string, TierConfig > tiers
Tier name → config.
Definition config.h:357
std::string default_tier
Default tier name.
Definition config.h:359
Full parsed configuration.
Definition config.h:714
int vram_reserve_mb
Reserved VRAM headroom (MB, 0–65536)
Definition config.h:735
RoutingConfig routing
Routing rules.
Definition config.h:716
InferenceConfig inference
Inference-side knobs (currently speculative decoding only).
Definition config.h:760
ModelsConfig models
Tiers + router.
Definition config.h:715
std::filesystem::path log_dir
Session log directory (session.log + session_model.log).
Definition config.h:742
bool ggml_logging
Enable ggml/llama.cpp logging to llama_ggml.log in log_dir.
Definition config.h:746
std::filesystem::path config_dir
Config dir — base for bundled data discovery.
Definition config.h:738
std::string fallback_tier
Fallback when routing fails.
Definition config.h:390
bool enabled
Enable routing.
Definition config.h:389
Result metadata from a routing decision.
std::string adapter_name
Active adapter (empty = base model) (v1.9.2)
std::string swap_action
"none", "reused", "loaded"
double adapter_swap_ms
Adapter swap latency (v1.9.2)
bool enabled
Master switch (off by default)
Definition config.h:673
int n_draft
Window size (proposed tokens)
Definition config.h:674
ModelConfig draft
Full ModelConfig for the draft model.
Definition config.h:690