35 const std::string& role,
const ModelConfig& config) {
// Serialize all slot-table access; this guard stays alive for the entire call.
36 std::lock_guard<std::mutex> lock(slots_mutex_);
// Roles are compared by the string form of the configured model path.
38 const std::string new_path = config.
path.string();
// A role whose recorded path equals new_path AND whose backend still reports
// is_loaded() takes the nested branch below.
39 auto path_it = slot_paths_.find(role);
40 if (path_it != slot_paths_.end() && path_it->second == new_path) {
41 auto it = slots_.find(role);
42 if (it != slots_.end() && it->second->is_loaded()) {
// Build a new LlamaCppBackend and try to bring it up; a false result from
// load_and_activate() is reported through logger->error. On the success path
// the backend and its path string are stored under `role` and an info line
// is logged.
// NOTE(review): slots_mutex_ is held while load_and_activate() runs, so other
// accessors taking the same mutex wait for the load — confirm this is intended.
47 auto backend = std::make_shared<LlamaCppBackend>();
48 if (!backend->load_and_activate(config)) {
49 logger->error(
"Failed to activate role '{}' from path: {}",
54 slots_[role] = backend;
55 slot_paths_[role] = new_path;
56 logger->info(
"Activated secondary role '{}' from {}", role, new_path);
// Look up the backend registered for `role` while holding slots_mutex_.
// Yields nullptr for an unknown role, otherwise the raw (non-owning) pointer
// obtained from the stored smart pointer.
// NOTE(review): the guard is released on return, so nothing here protects the
// pointee afterwards — confirm callers cannot race with removal of the role.
68 std::lock_guard<std::mutex> lock(slots_mutex_);
69 auto it = slots_.find(role);
70 return (it == slots_.end()) ? nullptr : it->second.get();
81 const std::string& role)
const {
// Same lookup as the raw-pointer accessor, performed under slots_mutex_.
// An unknown role yields an empty std::shared_ptr<InferenceBackend>.
82 std::lock_guard<std::mutex> lock(slots_mutex_);
83 auto it = slots_.find(role);
84 return (it == slots_.end()) ? std::shared_ptr<InferenceBackend>{}
// Under slots_mutex_: find the role's slot, unload its backend if it is still
// loaded, drop the recorded path for the role, and log the release.
96 std::lock_guard<std::mutex> lock(slots_mutex_);
97 auto it = slots_.find(role);
// Unknown role: handled by the branch below.
98 if (it == slots_.end()) {
// Only call unload() on a backend that reports is_loaded().
101 if (it->second->is_loaded()) {
102 it->second->unload();
// Forget the path so a later lookup in slot_paths_ no longer finds this role.
105 slot_paths_.erase(role);
106 logger->info(
"Released secondary role '{}'", role);
// True only when `role` has a slot entry AND that backend reports is_loaded();
// an unknown role yields false. The check runs under slots_mutex_.
118 std::lock_guard<std::mutex> lock(slots_mutex_);
119 auto it = slots_.find(role);
120 return it != slots_.end() && it->second->is_loaded();
// Walk every slot under slots_mutex_, testing is_loaded() on each backend.
130 std::lock_guard<std::mutex> lock(slots_mutex_);
131 std::vector<std::string> out;
// Capacity is reserved for the total slot count, an upper bound on the number
// of loaded roles.
132 out.reserve(slots_.size());
133 for (
const auto& [role, backend] : slots_) {
134 if (backend->is_loaded()) {
// Sorting makes the order of `out` independent of the container's iteration
// order.
138 std::sort(out.begin(), out.end());
// Under slots_mutex_, call clear_prompt_cache() on each backend in slots_.
// NOTE(review): no is_loaded() check is visible before the call, while this
// function's summary says "every loaded backend" — confirm that
// clear_prompt_cache() is safe on a backend that is not loaded.
148 std::lock_guard<std::mutex> lock(slots_mutex_);
149 for (
auto& [role, backend] : slots_) {
150 backend->clear_prompt_cache();
// Under slots_mutex_, visit every slot; only backends that report is_loaded()
// enter the branch below (presumably to be unloaded — verify against the
// branch body).
160 std::lock_guard<std::mutex> lock(slots_mutex_);
161 for (
auto& [role, backend] : slots_) {
162 if (backend->is_loaded()) {
Concrete base class for inference backends (80% logic).
std::shared_ptr< InferenceBackend > get_shared(const std::string &role) const
Get the backend for a role as a shared_ptr.
void clear_all_prompt_caches()
Fanout: clear prompt/KV cache on every loaded backend.
bool is_loaded(const std::string &role) const
Check whether a role is currently loaded and active.
std::vector< std::string > loaded_roles() const
Names of all roles with a currently-loaded backend.
bool release_role(const std::string &role)
Unload and drop a role.
void shutdown()
Unload every role.
InferenceBackend * get(const std::string &role) const
Get the backend for a role.
bool ensure_loaded(const std::string &role, const ModelConfig &config)
Lazily load and activate a model for a role.
LlamaCppBackend — llama.cpp C API integration.
spdlog initialization and logger access.
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
Activate model on GPU (WARM → ACTIVE).
Unified lifecycle for non-primary inference backends.
Model configuration for a single tier.
std::filesystem::path path
Resolved model file path.