35 #include <unordered_map>
94 const std::string& role)
const;
114 bool is_loaded(
const std::string& role)
const;
151 mutable std::mutex slots_mutex_;
156 std::unordered_map<std::string, std::shared_ptr<InferenceBackend>>
161 std::unordered_map<std::string, std::string> slot_paths_;
Concrete base class for inference backends (80% logic).
Role-keyed lifecycle manager for non-primary models.
std::shared_ptr< InferenceBackend > get_shared(const std::string &role) const
Get the backend for a role as a shared_ptr.
void clear_all_prompt_caches()
Fanout: clear prompt/KV cache on every loaded backend.
bool is_loaded(const std::string &role) const
Check whether a role is currently loaded and active.
std::vector< std::string > loaded_roles() const
Names of all roles with a currently-loaded backend.
bool release_role(const std::string &role)
Unload and drop a role.
void shutdown()
Unload every role.
InferenceBackend * get(const std::string &role) const
Get the backend for a role.
bool ensure_loaded(const std::string &role, const ModelConfig &config)
Lazily load and activate a model for a role.
Configuration structs with defaults.
InferenceBackend concrete base class.
Activate model on GPU (WARM → ACTIVE).
Model configuration for a single tier.