Entropic 2.3.8
Local-first agentic inference engine
Loading...
Searching...
No Matches
adapter_manager.cpp
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
17
18#include <llama.h>
19#include <nlohmann/json.hpp>
20
21#include <cstring>
22
23namespace entropic {
24
25namespace {
// Logger for this translation unit, looked up via entropic::log::get() under
// the name "inference.adapter_manager". It lives in the anonymous namespace,
// so it is not visible outside this file.
26auto logger = entropic::log::get("inference.adapter_manager");
29
38void apply_adapter(llama_context* ctx,
39 llama_adapter_lora* adapter, float scale)
40{
41 llama_set_adapters_lora(ctx, &adapter, 1, &scale);
42}
43
50void clear_adapters(llama_context* ctx) {
51 llama_set_adapters_lora(ctx, nullptr, 0, nullptr);
52}
53
54} // anonymous namespace
55
56// ── Load ────────────────────────────────────────────────────
57
74 const std::string& name,
75 const std::filesystem::path& adapter_path,
76 llama_model* model,
77 float scale)
78{
79 std::lock_guard<std::mutex> lock(adapter_mutex_);
80
81 bool precondition_failed =
82 adapters_.find(name) != adapters_.end() || !model;
83 if (precondition_failed) {
84 logger->error("Cannot load adapter '{}': {}",
85 name, !model ? "base model is null" : "duplicate name");
86 return false;
87 }
88
89 auto t_start = now();
90 auto* lora = llama_adapter_lora_init(model, adapter_path.c_str());
91 if (!lora) {
92 logger->error("llama_adapter_lora_init failed for '{}' at {}",
93 name, adapter_path.string());
94 return false;
95 }
96
97 AdapterEntry entry;
98 entry.name = name;
99 entry.path = adapter_path;
100 entry.handle = lora;
101 entry.model = model;
102 entry.scale = scale;
103 entry.state = AdapterState::WARM;
104 adapters_.emplace(name, std::move(entry));
105
106 logger->info("Loaded adapter '{}' from {} in {:.1f}ms (scale={:.2f})",
107 name, adapter_path.string(),
108 elapsed_ms(t_start, now()), scale);
109 return true;
110}
111
112// ── Unload ──────────────────────────────────────────────────
113
125void AdapterManager::unload(const std::string& name, llama_context* ctx) {
126 std::lock_guard<std::mutex> lock(adapter_mutex_);
127
128 auto it = adapters_.find(name);
129 if (it == adapters_.end()) {
130 return;
131 }
132
133 auto& entry = it->second;
134 if (entry.state == AdapterState::HOT && ctx) {
135 clear_adapters(ctx);
136 active_name_.clear();
137 }
138
139 if (entry.handle) {
140 llama_adapter_lora_free(entry.handle);
141 entry.handle = nullptr;
142 }
143
144 entry.state = AdapterState::COLD;
145 logger->info("Unloaded adapter '{}'", name);
146
147 adapters_.erase(it);
148}
149
150// ── Activate ────────────────────────────────────────────────
151
164bool AdapterManager::activate(const std::string& name, llama_context* ctx) {
165 std::lock_guard<std::mutex> lock(adapter_mutex_);
166
167 auto it = adapters_.find(name);
168 bool cannot_activate =
169 it == adapters_.end() || it->second.state == AdapterState::COLD;
170 if (it != adapters_.end() && it->second.state == AdapterState::HOT) {
171 return true; // Already active — no-op
172 }
173 if (cannot_activate) {
174 logger->error("Cannot activate adapter '{}': {}",
175 name, it == adapters_.end() ? "not found" : "state is COLD");
176 return false;
177 }
178
179 auto& entry = it->second;
180
181 // Mark previous HOT adapter as WARM
182 if (!active_name_.empty() && active_name_ != name) {
183 auto active_it = adapters_.find(active_name_);
184 if (active_it != adapters_.end()) {
185 active_it->second.state = AdapterState::WARM;
186 }
187 }
188
189 apply_adapter(ctx, entry.handle, entry.scale);
190 entry.state = AdapterState::HOT;
191 active_name_ = name;
192
193 logger->info("Activated adapter '{}' (scale={:.2f})", name, entry.scale);
194 return true;
195}
196
197// ── Deactivate ──────────────────────────────────────────────
198
208void AdapterManager::deactivate(llama_context* ctx) {
209 std::lock_guard<std::mutex> lock(adapter_mutex_);
210
211 if (active_name_.empty()) {
212 return;
213 }
214
215 auto it = adapters_.find(active_name_);
216 if (it != adapters_.end()) {
217 if (ctx) {
218 clear_adapters(ctx);
219 }
220 it->second.state = AdapterState::WARM;
221 }
222
223 logger->info("Deactivated adapter '{}'", active_name_);
224 active_name_.clear();
225}
226
227// ── Swap ────────────────────────────────────────────────────
228
241bool AdapterManager::swap(const std::string& name, llama_context* ctx) {
242 std::lock_guard<std::mutex> lock(adapter_mutex_);
243
244 auto it = adapters_.find(name);
245 bool cannot_swap = it == adapters_.end()
246 || it->second.state == AdapterState::COLD;
247 if (active_name_ == name && !cannot_swap) {
248 return true; // Already active — no-op
249 }
250 if (cannot_swap || !fire_swap_hook(active_name_, name, it->second.path)) {
251 logger->error("Cannot swap to adapter '{}': {}",
252 name, cannot_swap ? "not found or COLD" : "cancelled by hook");
253 return false;
254 }
255
256 auto t_start = now();
257
258 // Mark current HOT as WARM
259 if (!active_name_.empty()) {
260 auto active_it = adapters_.find(active_name_);
261 if (active_it != adapters_.end()) {
262 active_it->second.state = AdapterState::WARM;
263 }
264 }
265
266 // Apply target
267 auto& target = it->second;
268 apply_adapter(ctx, target.handle, target.scale);
269 target.state = AdapterState::HOT;
270
271 std::string previous = active_name_;
272 active_name_ = name;
273
274 logger->info("Swapped adapter '{}' -> '{}' in {:.1f}ms",
275 previous, name, elapsed_ms(t_start, now()));
276 return true;
277}
278
279// ── Unload All ──────────────────────────────────────────────
280
293 llama_model* model, llama_context* ctx)
294{
295 std::lock_guard<std::mutex> lock(adapter_mutex_);
296
297 bool cleared_context = false;
298 std::vector<std::string> to_remove;
299
300 for (auto& [name, entry] : adapters_) {
301 if (entry.model != model) {
302 continue;
303 }
304
305 if (entry.state == AdapterState::HOT
306 && ctx && !cleared_context)
307 {
308 clear_adapters(ctx);
309 cleared_context = true;
310 }
311
312 if (entry.handle) {
313 llama_adapter_lora_free(entry.handle);
314 entry.handle = nullptr;
315 }
316
317 entry.state = AdapterState::COLD;
318 to_remove.push_back(name);
319 }
320
321 for (const auto& name : to_remove) {
322 if (name == active_name_) {
323 active_name_.clear();
324 }
325 adapters_.erase(name);
326 }
327
328 logger->info("Unloaded {} adapter(s) for model", to_remove.size());
329}
330
344 std::lock_guard<std::mutex> lock(adapter_mutex_);
345 size_t freed = 0;
346 for (auto& [name, entry] : adapters_) {
347 if (entry.handle) {
348 llama_adapter_lora_free(entry.handle);
349 entry.handle = nullptr;
350 ++freed;
351 }
352 entry.state = AdapterState::COLD;
353 }
354 adapters_.clear();
355 active_name_.clear();
356 if (freed > 0) {
357 logger->info("Unloaded all {} adapter(s) on shutdown", freed);
358 }
359}
360
361// ── Queries ─────────────────────────────────────────────────
362
370AdapterState AdapterManager::state(const std::string& name) const {
371 std::lock_guard<std::mutex> lock(adapter_mutex_);
372 auto it = adapters_.find(name);
373 if (it == adapters_.end()) {
374 return AdapterState::COLD;
375 }
376 return it->second.state;
377}
378
386AdapterInfo AdapterManager::info(const std::string& name) const {
387 std::lock_guard<std::mutex> lock(adapter_mutex_);
388 auto it = adapters_.find(name);
389 if (it == adapters_.end()) {
390 return {};
391 }
392 return make_info(it->second);
393}
394
401std::vector<AdapterInfo> AdapterManager::list_adapters() const {
402 std::lock_guard<std::mutex> lock(adapter_mutex_);
403 std::vector<AdapterInfo> result;
404 result.reserve(adapters_.size());
405 for (const auto& [name, entry] : adapters_) {
406 result.push_back(make_info(entry));
407 }
408 return result;
409}
410
418 std::lock_guard<std::mutex> lock(adapter_mutex_);
419 return active_name_;
420}
421
428void AdapterManager::set_hook_interface(const HookInterface& hooks) {
429 hooks_ = hooks;
430}
431
432// ── Private ─────────────────────────────────────────────────
433
441AdapterInfo AdapterManager::make_info(const AdapterEntry& entry) {
443 info.name = entry.name;
444 info.path = entry.path;
445 info.state = entry.state;
446 info.scale = entry.scale;
447 info.tier_name = entry.tier_name;
448 info.ram_bytes = entry.ram_bytes;
449 info.metadata = entry.metadata;
450 return info;
451}
452
466bool AdapterManager::fire_swap_hook(
467 const std::string& current,
468 const std::string& target,
469 const std::filesystem::path& target_path)
470{
471 if (!hooks_.fire_pre || !hooks_.registry) {
472 return true; // No hook registered — proceed
473 }
474
475 nlohmann::json ctx;
476 ctx["current_adapter"] = current;
477 ctx["target_adapter"] = target;
478 ctx["adapter_path"] = target_path.string();
479 std::string ctx_str = ctx.dump();
480
481 char* modified = nullptr;
482 int rc = hooks_.fire_pre(
483 hooks_.registry,
485 ctx_str.c_str(),
486 &modified);
487
488 if (modified) {
489 free(modified);
490 }
491
492 return rc == 0;
493}
494
495} // namespace entropic
AdapterManager — LoRA adapter lifecycle and hot-swap.
bool swap(const std::string &name, llama_context *ctx)
Swap to a different adapter atomically.
std::vector< AdapterInfo > list_adapters() const
List all known adapters.
bool activate(const std::string &name, llama_context *ctx)
Activate adapter on context (WARM -> HOT).
std::string active_adapter() const
Get the currently HOT adapter name.
void unload_all_for_model(llama_model *model, llama_context *ctx)
Unload all adapters for a given base model.
void deactivate(llama_context *ctx)
Deactivate current HOT adapter (HOT -> WARM).
bool load(const std::string &name, const std::filesystem::path &adapter_path, llama_model *model, float scale=1.0f)
Load a LoRA adapter into RAM (COLD -> WARM).
void unload(const std::string &name, llama_context *ctx)
Unload adapter (any state -> COLD).
AdapterInfo info(const std::string &name) const
Get metadata for an adapter.
void unload_all()
Free every loaded adapter handle (gh#58 close-out, v2.3.0).
AdapterState state(const std::string &name) const
Get adapter state.
void set_hook_interface(const HookInterface &hooks)
Set hook interface for ON_ADAPTER_SWAP dispatch.
@ ENTROPIC_HOOK_ON_ADAPTER_SWAP
16: Adapter/LoRA swap requested
Definition hooks.h:54
spdlog initialization and logger access.
auto now()
Get current time for timing measurements.
Definition logging.h:193
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
Definition logging.cpp:211
double elapsed_ms(std::chrono::steady_clock::time_point start, std::chrono::steady_clock::time_point end)
Compute elapsed milliseconds between two time points.
Definition logging.h:203
Activate model on GPU (WARM → ACTIVE).
AdapterState
LoRA adapter lifecycle state.
Definition config.h:112
@ WARM
Loaded in RAM via llama_adapter_lora_init(). Ready to activate.
@ COLD
Not loaded. No resources consumed.
@ HOT
Active on context via llama_set_adapters_lora(). Influencing generation.
Metadata for a loaded LoRA adapter.
Definition config.h:126
std::string tier_name
Tier this adapter is assigned to (empty = unassigned)
Definition config.h:131
size_t ram_bytes
RAM consumption when WARM/HOT (0 if COLD)
Definition config.h:133
std::filesystem::path path
Resolved path to .gguf adapter file.
Definition config.h:128
AdapterState state
Current lifecycle state.
Definition config.h:129
std::string name
Unique adapter identifier.
Definition config.h:127
std::unordered_map< std::string, std::string > metadata
Adapter-specific metadata for routing decisions.
Definition config.h:136
float scale
LoRA scaling factor (alpha/rank)
Definition config.h:130