33uint64_t fnv1a_64(
const char* data,
size_t len) {
34 uint64_t h = 14695981039346656037ULL;
35 for (
size_t i = 0; i < len; ++i) {
36 h ^=
static_cast<uint64_t
>(
static_cast<uint8_t
>(data[i]));
37 h *= 1099511628211ULL;
50 : max_bytes_(max_bytes)
68 std::string_view model_path)
71 combined.reserve(prompt_text.size() + 1 + model_path.size());
72 combined.append(prompt_text);
73 combined.push_back(
'\0');
74 combined.append(model_path);
75 return CacheKey{fnv1a_64(combined.data(), combined.size())};
92 std::vector<uint8_t>&& data,
95 std::lock_guard<std::mutex> lock(mutex_);
97 size_t entry_size = data.size();
99 if (max_bytes_ == 0) {
103 if (entry_size > max_bytes_) {
104 logger->warn(
"Cache entry ({} bytes) exceeds max_bytes ({}), skipping",
105 entry_size, max_bytes_);
110 auto it = entries_.find(key);
111 if (it != entries_.end()) {
112 bytes_used_ -= it->second.data_size;
114 auto lru_it = lru_map_.find(key);
115 if (lru_it != lru_map_.end()) {
116 lru_.erase(lru_it->second);
117 lru_map_.erase(lru_it);
121 evict_until(entry_size);
124 entry.
data = std::move(data);
128 entries_[key] = std::move(entry);
129 lru_.push_front(key);
130 lru_map_[key] = lru_.begin();
132 bytes_used_ += entry_size;
138 logger->info(
"Cached prompt prefix: {} bytes, {} tokens, {} entries total",
139 entry_size, token_count, entries_.size());
155 std::lock_guard<std::mutex> lock(mutex_);
157 auto it = entries_.find(key);
158 if (it == entries_.end()) {
160 logger->info(
"Cache MISS: key={:016x}, {} hits / {} misses",
166 auto lru_it = lru_map_.find(key);
167 if (lru_it != lru_map_.end()) {
168 lru_.splice(lru_.begin(), lru_, lru_it->second);
172 logger->info(
"Cache HIT: key={:016x}, {} tokens, {} bytes",
173 key.hash, it->second.token_count,
174 it->second.data_size);
184 std::lock_guard<std::mutex> lock(mutex_);
189 logger->info(
"Prompt cache cleared");
199 std::lock_guard<std::mutex> lock(mutex_);
210 std::lock_guard<std::mutex> lock(mutex_);
211 return entries_.size();
221 std::lock_guard<std::mutex> lock(mutex_);
235void PromptCache::evict_until(
size_t needed_bytes) {
236 while (bytes_used_ + needed_bytes > max_bytes_ && !lru_.empty()) {
239 lru_map_.erase(victim);
241 auto it = entries_.find(victim);
242 if (it != entries_.end()) {
243 bytes_used_ -= it->second.data_size;
245 logger->info(
"Evicted cache entry: {} bytes freed",
246 it->second.data_size);
PromptCache(size_t max_bytes)
Construct the cache with a maximum RAM budget of max_bytes bytes.
CacheStats stats() const
Cache hit/miss statistics.
static CacheKey make_key(std::string_view prompt_text, std::string_view model_path)
Compute a cache key from prompt text and model path.
const CacheEntry * lookup(const CacheKey &key)
Retrieve a cached KV snapshot.
bool store(const CacheKey &key, std::vector< uint8_t > &&data, int token_count)
Store a KV cache snapshot.
size_t entry_count() const
Number of cached entries.
void clear()
Evict all entries.
size_t bytes_used() const
Current total bytes consumed by cached entries.
spdlog initialization and logger access.
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
Activate model on GPU (WARM → ACTIVE).
Host-memory KV cache state storage with LRU eviction.
Single cached KV state snapshot.
std::vector< uint8_t > data
Raw KV cache bytes.
size_t data_size
Value of data.size(), stored for quick byte accounting.
int token_count
Prompt tokens covered by this entry.
64-bit hash used as cache lookup key.
Cumulative cache performance counters.
uint64_t hits
Successful lookups.
size_t peak_bytes
High-water mark of bytes_used.
uint64_t evictions
LRU evictions.
uint64_t misses
Failed lookups.
uint64_t stores
Successful stores.