entropic/prompt__cache_8cpp_source.html

// SPDX-License-Identifier: Apache-2.0

#include "prompt_cache.h"


#include <entropic/types/logging.h>


#include <algorithm>


namespace entropic {


namespace {


// File-local logger for this translation unit, obtained by name from entropic::log::get.
auto logger = entropic::log::get("inference.prompt_cache");


/// Compute the 64-bit FNV-1a hash of the `len` bytes starting at `data`.
///
/// Each byte is treated as unsigned, XOR-ed into the running hash, and the
/// result is multiplied by the FNV prime (wrapping modulo 2^64).
/// An empty range (len == 0) yields the offset basis unchanged.
uint64_t fnv1a_64(const char* data, size_t len) {
    constexpr uint64_t kOffsetBasis = 14695981039346656037ULL;
    constexpr uint64_t kPrime = 1099511628211ULL;

    uint64_t hash = kOffsetBasis;
    const char* const last = data + len;
    for (const char* cursor = data; cursor != last; ++cursor) {
        // Go through uint8_t first so bytes >= 0x80 are not sign-extended.
        const uint64_t octet = static_cast<uint8_t>(*cursor);
        hash = (hash ^ octet) * kPrime;
    }
    return hash;
}


} // anonymous namespace


/// Create an empty cache whose byte budget is `max_bytes`.
/// The byte counter starts at zero; store() rejects everything when the
/// budget is zero.
PromptCache::PromptCache(size_t max_bytes)
    : max_bytes_(max_bytes), bytes_used_(0) {}


/// Derive a cache key from the prompt text and the model path.
///
/// The two inputs are joined as `prompt_text + '\0' + model_path` and the
/// joined bytes are hashed with fnv1a_64. The NUL separator keeps pairs such
/// as ("ab", "c") and ("a", "bc") from producing the same byte sequence
/// (as long as the inputs themselves contain no NUL bytes).
///
/// @param prompt_text  Prompt text to hash.
/// @param model_path   Model path to hash.
/// @return CacheKey initialised with the 64-bit hash.
CacheKey PromptCache::make_key(std::string_view prompt_text,
                               std::string_view model_path)
{
    std::string joined;
    joined.reserve(prompt_text.size() + model_path.size() + 1);

    joined += prompt_text;
    joined += '\0';
    joined += model_path;

    const uint64_t digest = fnv1a_64(joined.data(), joined.size());
    return CacheKey{digest};
}


bool PromptCache::store(const CacheKey& key,

                        std::vector<uint8_t>&& data,

                        int token_count)

{

    std::lock_guard<std::mutex> lock(mutex_);


    size_t entry_size = data.size();


    if (max_bytes_ == 0) {

        return false;

    }


    if (entry_size > max_bytes_) {

        logger->warn("Cache entry ({} bytes) exceeds max_bytes ({}), skipping",

                     entry_size, max_bytes_);

        return false;

    }


    // Remove existing entry with same key if present

    auto it = entries_.find(key);

    if (it != entries_.end()) {

        bytes_used_ -= it->second.data_size;

        entries_.erase(it);

        auto lru_it = lru_map_.find(key);

        if (lru_it != lru_map_.end()) {

            lru_.erase(lru_it->second);

            lru_map_.erase(lru_it);

        }

    }


    evict_until(entry_size);


    CacheEntry entry;

    entry.data = std::move(data);

    entry.token_count = token_count;

    entry.data_size = entry_size;


    entries_[key] = std::move(entry);

    lru_.push_front(key);

    lru_map_[key] = lru_.begin();


    bytes_used_ += entry_size;

    ++stats_.stores;

    if (bytes_used_ > stats_.peak_bytes) {

        stats_.peak_bytes = bytes_used_;

    }


    logger->info("Cached prompt prefix: {} bytes, {} tokens, {} entries total",

                 entry_size, token_count, entries_.size());

    return true;

}


/// Find the entry stored under `key` and mark it as most recently used.
///
/// Updates the hit/miss counters and logs the outcome either way.
///
/// @param key  Key to look up.
/// @return Pointer to the entry held inside the cache, or nullptr on a miss.
///         The pointer refers to cache-owned storage and the lock is released
///         on return; store() (replacement or eviction) and clear() erase
///         entries, so the pointer must not be used after such a call.
const CacheEntry* PromptCache::lookup(const CacheKey& key) {
    const std::lock_guard<std::mutex> guard(mutex_);

    const auto found = entries_.find(key);
    if (found != entries_.end()) {
        // Refresh recency: relink the key's node to the head of the LRU list.
        if (const auto pos = lru_map_.find(key); pos != lru_map_.end()) {
            lru_.splice(lru_.begin(), lru_, pos->second);
        }

        ++stats_.hits;
        const CacheEntry& entry = found->second;
        logger->info("Cache HIT: key={:016x}, {} tokens, {} bytes",
                     key.hash, entry.token_count, entry.data_size);
        return &entry;
    }

    ++stats_.misses;
    logger->info("Cache MISS: key={:016x}, {} hits / {} misses",
                 key.hash, stats_.hits, stats_.misses);
    return nullptr;
}


/// Remove every entry and reset the byte counter to zero.
/// The counters in stats_ are not touched by this call.
void PromptCache::clear() {
    const std::lock_guard<std::mutex> guard(mutex_);

    bytes_used_ = 0;
    lru_map_.clear();
    lru_.clear();
    entries_.clear();

    logger->info("Prompt cache cleared");
}


/// Return the current byte counter (sum of data_size over stored entries),
/// read while holding the cache mutex.
size_t PromptCache::bytes_used() const {
    const std::lock_guard<std::mutex> guard(mutex_);
    const size_t snapshot = bytes_used_;
    return snapshot;
}


/// Return how many entries are currently stored, read while holding the
/// cache mutex.
size_t PromptCache::entry_count() const {
    const std::lock_guard<std::mutex> guard(mutex_);
    const size_t count = entries_.size();
    return count;
}


/// Return a copy of the statistics counters, taken while holding the cache
/// mutex so the fields are read together.
CacheStats PromptCache::stats() const {
    const std::lock_guard<std::mutex> guard(mutex_);
    CacheStats snapshot = stats_;
    return snapshot;
}


void PromptCache::evict_until(size_t needed_bytes) {

    while (bytes_used_ + needed_bytes > max_bytes_ && !lru_.empty()) {

        CacheKey victim = lru_.back();

        lru_.pop_back();

        lru_map_.erase(victim);


        auto it = entries_.find(victim);

        if (it != entries_.end()) {

            bytes_used_ -= it->second.data_size;

            ++stats_.evictions;

            logger->info("Evicted cache entry: {} bytes freed",

                         it->second.data_size);

            entries_.erase(it);

        }

    }

}


} // namespace entropic

entropic::PromptCache::PromptCache
PromptCache(size_t max_bytes)
Construct with maximum RAM budget.
Definition prompt_cache.cpp:49

entropic::PromptCache::stats
CacheStats stats() const
Cache hit/miss statistics.
Definition prompt_cache.cpp:220

entropic::PromptCache::make_key
static CacheKey make_key(std::string_view prompt_text, std::string_view model_path)
Compute a cache key from prompt text and model path.
Definition prompt_cache.cpp:67

entropic::PromptCache::lookup
const CacheEntry * lookup(const CacheKey &key)
Retrieve a cached KV snapshot.
Definition prompt_cache.cpp:154

entropic::PromptCache::store
bool store(const CacheKey &key, std::vector< uint8_t > &&data, int token_count)
Store a KV cache snapshot.
Definition prompt_cache.cpp:91

entropic::PromptCache::entry_count
size_t entry_count() const
Number of cached entries.
Definition prompt_cache.cpp:209

entropic::PromptCache::clear
void clear()
Evict all entries.
Definition prompt_cache.cpp:183

entropic::PromptCache::bytes_used
size_t bytes_used() const
Current total bytes consumed by cached entries.
Definition prompt_cache.cpp:198

logging.h
spdlog initialization and logger access.

entropic::log::get
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
Definition logging.cpp:211

entropic
Top-level namespace for the entropic library.
Definition bundled_models.h:20

prompt_cache.h
Host-memory KV cache state storage with LRU eviction.

entropic::CacheEntry
Single cached KV state snapshot.
Definition prompt_cache.h:62

entropic::CacheEntry::data
std::vector< uint8_t > data
Raw KV cache bytes.
Definition prompt_cache.h:63

entropic::CacheEntry::data_size
size_t data_size
data.size() for quick byte accounting
Definition prompt_cache.h:65

entropic::CacheEntry::token_count
int token_count
Prompt tokens covered by this entry.
Definition prompt_cache.h:64

entropic::CacheKey
64-bit hash used as cache lookup key.
Definition prompt_cache.h:38

entropic::CacheStats
Cumulative cache performance counters.
Definition prompt_cache.h:72

entropic::CacheStats::hits
uint64_t hits
Successful lookups.
Definition prompt_cache.h:73

entropic::CacheStats::peak_bytes
size_t peak_bytes
High-water mark of bytes_used.
Definition prompt_cache.h:77

entropic::CacheStats::evictions
uint64_t evictions
LRU evictions.
Definition prompt_cache.h:75

entropic::CacheStats::misses
uint64_t misses
Failed lookups.
Definition prompt_cache.h:74

entropic::CacheStats::stores
uint64_t stores
Successful stores.
Definition prompt_cache.h:76