Entropic 2.3.8
Local-first agentic inference engine
Loading...
Searching...
No Matches
prompt_cache.cpp
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
13#include "prompt_cache.h"
14
16
17#include <algorithm>
18
19namespace entropic {
20
21namespace {
22
23auto logger = entropic::log::get("inference.prompt_cache");
24
/**
 * @brief Compute the 64-bit FNV-1a hash of a byte buffer.
 *
 * The state starts at the 64-bit FNV offset basis. For each input byte the
 * byte is XORed into the state and the state is then multiplied by the
 * 64-bit FNV prime (arithmetic wraps modulo 2^64).
 *
 * @param data Pointer to the first byte to hash; never dereferenced when
 *             @p len is 0.
 * @param len  Number of bytes to hash.
 * @return The resulting hash; equal to the offset basis for empty input.
 */
uint64_t fnv1a_64(const char* data, size_t len) {
    constexpr uint64_t kOffsetBasis = 14695981039346656037ULL;
    constexpr uint64_t kPrime = 1099511628211ULL;

    uint64_t hash = kOffsetBasis;
    const char* const end = data + len;
    for (const char* cursor = data; cursor != end; ++cursor) {
        // Go through uint8_t first so bytes >= 0x80 are not sign-extended.
        const uint64_t octet = static_cast<uint8_t>(*cursor);
        hash = (hash ^ octet) * kPrime;
    }
    return hash;
}
41
42} // anonymous namespace
43
49PromptCache::PromptCache(size_t max_bytes)
50 : max_bytes_(max_bytes)
51 , bytes_used_(0)
52{
53}
54
67CacheKey PromptCache::make_key(std::string_view prompt_text,
68 std::string_view model_path)
69{
70 std::string combined;
71 combined.reserve(prompt_text.size() + 1 + model_path.size());
72 combined.append(prompt_text);
73 combined.push_back('\0');
74 combined.append(model_path);
75 return CacheKey{fnv1a_64(combined.data(), combined.size())};
76}
77
92 std::vector<uint8_t>&& data,
93 int token_count)
94{
95 std::lock_guard<std::mutex> lock(mutex_);
96
97 size_t entry_size = data.size();
98
99 if (max_bytes_ == 0) {
100 return false;
101 }
102
103 if (entry_size > max_bytes_) {
104 logger->warn("Cache entry ({} bytes) exceeds max_bytes ({}), skipping",
105 entry_size, max_bytes_);
106 return false;
107 }
108
109 // Remove existing entry with same key if present
110 auto it = entries_.find(key);
111 if (it != entries_.end()) {
112 bytes_used_ -= it->second.data_size;
113 entries_.erase(it);
114 auto lru_it = lru_map_.find(key);
115 if (lru_it != lru_map_.end()) {
116 lru_.erase(lru_it->second);
117 lru_map_.erase(lru_it);
118 }
119 }
120
121 evict_until(entry_size);
122
123 CacheEntry entry;
124 entry.data = std::move(data);
125 entry.token_count = token_count;
126 entry.data_size = entry_size;
127
128 entries_[key] = std::move(entry);
129 lru_.push_front(key);
130 lru_map_[key] = lru_.begin();
131
132 bytes_used_ += entry_size;
133 ++stats_.stores;
134 if (bytes_used_ > stats_.peak_bytes) {
135 stats_.peak_bytes = bytes_used_;
136 }
137
138 logger->info("Cached prompt prefix: {} bytes, {} tokens, {} entries total",
139 entry_size, token_count, entries_.size());
140 return true;
141}
142
155 std::lock_guard<std::mutex> lock(mutex_);
156
157 auto it = entries_.find(key);
158 if (it == entries_.end()) {
159 ++stats_.misses;
160 logger->info("Cache MISS: key={:016x}, {} hits / {} misses",
161 key.hash, stats_.hits, stats_.misses);
162 return nullptr;
163 }
164
165 // Move to front of LRU
166 auto lru_it = lru_map_.find(key);
167 if (lru_it != lru_map_.end()) {
168 lru_.splice(lru_.begin(), lru_, lru_it->second);
169 }
170
171 ++stats_.hits;
172 logger->info("Cache HIT: key={:016x}, {} tokens, {} bytes",
173 key.hash, it->second.token_count,
174 it->second.data_size);
175 return &it->second;
176}
177
184 std::lock_guard<std::mutex> lock(mutex_);
185 entries_.clear();
186 lru_.clear();
187 lru_map_.clear();
188 bytes_used_ = 0;
189 logger->info("Prompt cache cleared");
190}
191
199 std::lock_guard<std::mutex> lock(mutex_);
200 return bytes_used_;
201}
202
210 std::lock_guard<std::mutex> lock(mutex_);
211 return entries_.size();
212}
213
221 std::lock_guard<std::mutex> lock(mutex_);
222 return stats_;
223}
224
235void PromptCache::evict_until(size_t needed_bytes) {
236 while (bytes_used_ + needed_bytes > max_bytes_ && !lru_.empty()) {
237 CacheKey victim = lru_.back();
238 lru_.pop_back();
239 lru_map_.erase(victim);
240
241 auto it = entries_.find(victim);
242 if (it != entries_.end()) {
243 bytes_used_ -= it->second.data_size;
244 ++stats_.evictions;
245 logger->info("Evicted cache entry: {} bytes freed",
246 it->second.data_size);
247 entries_.erase(it);
248 }
249 }
250}
251
252} // namespace entropic
PromptCache(size_t max_bytes)
Construct with maximum RAM budget.
CacheStats stats() const
Cache hit/miss statistics.
static CacheKey make_key(std::string_view prompt_text, std::string_view model_path)
Compute a cache key from prompt text and model path.
const CacheEntry * lookup(const CacheKey &key)
Retrieve a cached KV snapshot.
bool store(const CacheKey &key, std::vector< uint8_t > &&data, int token_count)
Store a KV cache snapshot.
size_t entry_count() const
Number of cached entries.
void clear()
Evict all entries.
size_t bytes_used() const
Current total bytes consumed by cached entries.
spdlog initialization and logger access.
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
Definition logging.cpp:211
Activate model on GPU (WARM → ACTIVE).
Host-memory KV cache state storage with LRU eviction.
Single cached KV state snapshot.
std::vector< uint8_t > data
Raw KV cache bytes.
size_t data_size
data.size() for quick byte accounting
int token_count
Prompt tokens covered by this entry.
64-bit hash used as cache lookup key.
Cumulative cache performance counters.
uint64_t hits
Successful lookups.
size_t peak_bytes
High-water mark of bytes_used.
uint64_t evictions
LRU evictions.
uint64_t misses
Failed lookups.
uint64_t stores
Successful stores.