entropic/inference__c__api_8cpp_source.html

// SPDX-License-Identifier: Apache-2.0

#include <entropic/interfaces/i_inference_backend.h>
#include <entropic/entropic_export.h>
#include <entropic/types/logging.h>
#include <entropic/types/messages_json.h>

#include "llama_cpp_backend.h"

#include <nlohmann/json.hpp>

#include <atomic>
#include <cerrno>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <limits>
#include <mutex>
#include <optional>
#include <string>
#include <thread>


namespace {


// File-local logger registered under the name "inference.c_api"; every
// C API entry point in this file reports through it.
auto logger = entropic::log::get("inference.c_api");


/// Recover the C++ backend object behind an opaque C handle.
/// No validation is performed; a null handle yields a null pointer.
entropic::InferenceBackend* to_backend(entropic_inference_backend_t h) {
    using BackendPtr = entropic::InferenceBackend*;
    return reinterpret_cast<BackendPtr>(h);
}


/// Duplicate @p s into a malloc'd, NUL-terminated buffer that the caller
/// owns (release with std::free). Returns nullptr when malloc fails.
char* alloc_string(const std::string& s) {
    const std::size_t bytes = s.size() + 1;  // payload plus terminating NUL
    auto* copy = static_cast<char*>(std::malloc(bytes));
    if (copy == nullptr) {
        return nullptr;
    }
    std::memcpy(copy, s.c_str(), bytes);
    return copy;
}


/// Overwrite @p out with the value stored under @p key, converted to T.
/// When @p j has no such key, @p out is left untouched. A value that
/// cannot be converted to T makes the JSON library throw.
template <typename T>
static void set_if(const nlohmann::json& j, const char* key, T& out) {
    const auto hit = j.find(key);
    if (hit != j.end()) {
        out = hit->get<T>();
    }
}


/// Build a ModelConfig from a JSON document.
///
/// Starts from a default-constructed ModelConfig and overrides only the
/// fields whose keys are present. Parse errors and type mismatches
/// propagate as exceptions from the JSON library.
///
/// @param json_str NUL-terminated JSON text.
/// @return The populated configuration.
entropic::ModelConfig parse_config_json(const char* json_str) {
    entropic::ModelConfig cfg;
    const auto doc = nlohmann::json::parse(json_str);

    // Model location and chat adapter.
    set_if(doc, "path", cfg.path);
    set_if(doc, "adapter", cfg.adapter);

    // Context / offload / residency knobs.
    set_if(doc, "context_length", cfg.context_length);
    set_if(doc, "gpu_layers", cfg.gpu_layers);
    set_if(doc, "keep_warm", cfg.keep_warm);
    set_if(doc, "use_mlock", cfg.use_mlock);

    // Compute tuning.
    set_if(doc, "n_batch", cfg.n_batch);
    set_if(doc, "n_threads", cfg.n_threads);
    set_if(doc, "flash_attn", cfg.flash_attn);

    return cfg;
}


/// Copy the value stored under @p key into @p dst (converted to T) when
/// the key exists in @p j; otherwise leave @p dst unchanged.
template <typename T>
static void assign_if_present(const nlohmann::json& j,
                              const char* key, T& dst) {
    if (!j.contains(key)) {
        return;
    }
    dst = j[key].get<T>();
}


/// Copy the optional "logit_bias" object of @p j into @p dst.
///
/// Each member maps a decimal token id (the JSON key) to a float bias.
/// Members are skipped, without failing the whole parse, when the key is
/// not entirely an integer (e.g. "abc", "12abc", out of int range) or
/// the value is not convertible to float. If "logit_bias" is absent or
/// is not an object, @p dst is left unchanged.
///
/// @param j   Parsed generation-params document.
/// @param dst Map that receives the token-id -> bias entries.
static void parse_logit_bias_into(
    const nlohmann::json& j,
    std::unordered_map<int32_t, float>& dst)
{
    const auto found = j.find("logit_bias");
    if (found == j.end() || !found->is_object()) {
        return;
    }
    // Look the object up once instead of on every loop iteration.
    const nlohmann::json& bias = *found;
    for (auto it = bias.begin(); it != bias.end(); ++it) {
        try {
            const std::string& key = it.key();
            std::size_t consumed = 0;
            const int token_id = std::stoi(key, &consumed);
            // std::stoi stops at the first non-digit; require that the
            // whole key was consumed so "12abc" is not read as token 12.
            if (consumed != key.size()) {
                continue;
            }
            // Convert the value before touching dst so a bad value never
            // leaves a default-inserted entry behind.
            const float bias_value = it.value().get<float>();
            dst[token_id] = bias_value;
        } catch (const std::exception&) {
            // skip un-parseable keys / values
        }
    }
}


/// Build GenerationParams from a JSON document.
///
/// Fields whose keys are missing keep the defaults of a
/// default-constructed GenerationParams. Parse errors and type
/// mismatches propagate as exceptions from the JSON library; bad
/// "logit_bias" members are skipped by parse_logit_bias_into.
///
/// @param json_str NUL-terminated JSON text.
/// @return The populated parameters.
entropic::GenerationParams parse_params_json(const char* json_str) {
    entropic::GenerationParams out;
    const auto doc = nlohmann::json::parse(json_str);

    // Sampling controls.
    assign_if_present(doc, "temperature", out.temperature);
    assign_if_present(doc, "top_p", out.top_p);
    assign_if_present(doc, "top_k", out.top_k);
    assign_if_present(doc, "min_p", out.min_p);

    // Penalties.
    assign_if_present(doc, "presence_penalty", out.presence_penalty);
    assign_if_present(doc, "frequency_penalty", out.frequency_penalty);
    assign_if_present(doc, "repeat_penalty", out.repeat_penalty);

    // Length limit, grammar constraint, per-token bias.
    assign_if_present(doc, "max_tokens", out.max_tokens);
    assign_if_present(doc, "grammar", out.grammar);
    parse_logit_bias_into(doc, out.logit_bias);

    return out;
}


/// Render a GenerationResult as a compact JSON object string with the
/// keys content, finish_reason, token_count, generation_time_ms,
/// error_code (as an integer) and error_message.
std::string serialize_result_json(const entropic::GenerationResult& result) {
    const nlohmann::json payload = {
        {"content", result.content},
        {"finish_reason", result.finish_reason},
        {"token_count", result.token_count},
        {"generation_time_ms", result.generation_time_ms},
        {"error_code", static_cast<int>(result.error_code)},
        {"error_message", result.error_message},
    };
    return payload.dump();
}


/* parse_content_part + parse_messages_json moved to the shared

 * utility include/entropic/types/messages_json.h (v2.1.8, gh#37) so

 * the facade's entropic_run_messages can reuse them. Calls below

 * dispatch to entropic::parse_messages_json directly. */


} // anonymous namespace


// ── C API Implementation ───────────────────────────────────


extern "C" {


/**
 * @brief Plugin C API: load a model into the inference backend.
 *
 * Parses @p config_json into a ModelConfig and hands it to the backend's
 * load().
 *
 * @param backend     Opaque backend handle; must not be null.
 * @param config_json NUL-terminated JSON configuration; must not be null.
 * @return ENTROPIC_OK when load() reports success;
 *         ENTROPIC_ERROR_INVALID_ARGUMENT when an argument is null;
 *         ENTROPIC_ERROR_LOAD_FAILED when load() reports failure or an
 *         exception (e.g. malformed JSON) is caught.
 */
ENTROPIC_EXPORT entropic_error_t entropic_inference_load(
    entropic_inference_backend_t backend,
    const char* config_json)
{
    // v2.3.10: null-handle guard. Pre-v2.3.10 to_backend(nullptr)
    // returned nullptr and the next ->load() dereferenced it, taking
    // the whole process down with SIGSEGV. The plugin ABI is the
    // load-bearing boundary for misuse; rejecting null with a clean
    // error code is safer than crashing.
    // The same reasoning applies to config_json: handing a null pointer
    // to the JSON parser is not guaranteed to fail cleanly.
    if (!backend || !config_json) { return ENTROPIC_ERROR_INVALID_ARGUMENT; }
    logger->info("C API: inference_load");
    try {
        auto config = parse_config_json(config_json);
        auto rc = to_backend(backend)->load(config)
            ? ENTROPIC_OK : ENTROPIC_ERROR_LOAD_FAILED;
        logger->info("C API: inference_load -> {}",
                     static_cast<int>(rc));
        return rc;
    } catch (const std::exception& e) {
        logger->error("inference_load exception: {}", e.what());
        return ENTROPIC_ERROR_LOAD_FAILED;
    }
}


/**
 * @brief Plugin C API: ask the backend to activate.
 *
 * @param backend Opaque backend handle.
 * @return ENTROPIC_ERROR_INVALID_ARGUMENT for a null handle; ENTROPIC_OK
 *         when activate() reports success; ENTROPIC_ERROR_LOAD_FAILED
 *         when it reports failure or throws a std::exception.
 */
ENTROPIC_EXPORT entropic_error_t entropic_inference_activate(
    entropic_inference_backend_t backend)
{
    if (backend == nullptr) {
        return ENTROPIC_ERROR_INVALID_ARGUMENT;
    }
    try {
        if (to_backend(backend)->activate()) {
            return ENTROPIC_OK;
        }
        return ENTROPIC_ERROR_LOAD_FAILED;
    } catch (const std::exception& ex) {
        logger->error("inference_activate exception: {}", ex.what());
        return ENTROPIC_ERROR_LOAD_FAILED;
    }
}


/**
 * @brief Plugin C API: ask the backend to deactivate.
 *
 * @param backend Opaque backend handle.
 * @return ENTROPIC_ERROR_INVALID_ARGUMENT for a null handle;
 *         ENTROPIC_ERROR_INTERNAL when deactivate() throws a
 *         std::exception; ENTROPIC_OK otherwise.
 */
ENTROPIC_EXPORT entropic_error_t entropic_inference_deactivate(
    entropic_inference_backend_t backend)
{
    if (backend == nullptr) {
        return ENTROPIC_ERROR_INVALID_ARGUMENT;
    }
    entropic::InferenceBackend* impl = to_backend(backend);
    try {
        impl->deactivate();
    } catch (const std::exception& ex) {
        logger->error("inference_deactivate exception: {}", ex.what());
        return ENTROPIC_ERROR_INTERNAL;
    }
    return ENTROPIC_OK;
}


/**
 * @brief Plugin C API: ask the backend to unload its model.
 *
 * @param backend Opaque backend handle.
 * @return ENTROPIC_ERROR_INVALID_ARGUMENT for a null handle;
 *         ENTROPIC_ERROR_INTERNAL when unload() throws a std::exception;
 *         ENTROPIC_OK otherwise.
 */
ENTROPIC_EXPORT entropic_error_t entropic_inference_unload(
    entropic_inference_backend_t backend)
{
    if (backend == nullptr) {
        return ENTROPIC_ERROR_INVALID_ARGUMENT;
    }
    entropic::InferenceBackend* impl = to_backend(backend);
    try {
        impl->unload();
    } catch (const std::exception& ex) {
        logger->error("inference_unload exception: {}", ex.what());
        return ENTROPIC_ERROR_INTERNAL;
    }
    return ENTROPIC_OK;
}


/**
 * @brief Plugin C API: report the backend's lifecycle state as an int.
 *
 * @param backend Opaque backend handle; may be null.
 * @return The backend's state() cast to int, or
 *         ENTROPIC_MODEL_STATE_COLD for a null handle.
 */
ENTROPIC_EXPORT int entropic_inference_state(
    entropic_inference_backend_t backend)
{
    // v2.3.10: a null handle owns nothing, so COLD is the safest answer;
    // callers gating activation on state should treat null and COLD
    // identically anyway.
    return backend
        ? static_cast<int>(to_backend(backend)->state())
        : static_cast<int>(ENTROPIC_MODEL_STATE_COLD);
}


/**
 * @brief Plugin C API: blocking generation returning the full result.
 *
 * @param backend       Opaque backend handle; must not be null.
 * @param messages_json NUL-terminated messages JSON; must not be null.
 * @param params_json   NUL-terminated generation-params JSON; must not
 *                      be null.
 * @param result_json   Out: malloc'd JSON string describing the result
 *                      (release with entropic_inference_free). It is
 *                      also populated when generation reports an error.
 *                      Set to nullptr when no string was produced
 *                      (exception, allocation failure, null JSON input).
 * @return ENTROPIC_OK on success; ENTROPIC_ERROR_INVALID_ARGUMENT when
 *         an argument is null; the result's error_code when generation
 *         reports failure; ENTROPIC_ERROR_GENERATE_FAILED when an
 *         exception is caught.
 */
ENTROPIC_EXPORT entropic_error_t entropic_inference_generate(
    entropic_inference_backend_t backend,
    const char* messages_json,
    const char* params_json,
    char** result_json)
{
    if (!backend || !result_json) {
        return ENTROPIC_ERROR_INVALID_ARGUMENT;
    }
    // Give the out-parameter a defined value on every exit path so a
    // caller that frees it after a failure never frees garbage.
    *result_json = nullptr;
    // Null JSON text must not reach the parsers.
    if (!messages_json || !params_json) {
        return ENTROPIC_ERROR_INVALID_ARGUMENT;
    }
    logger->info("C API: inference_generate");
    try {
        auto msgs = entropic::parse_messages_json(messages_json);
        auto params = parse_params_json(params_json);
        auto result = to_backend(backend)->generate(msgs, params);
        *result_json = alloc_string(serialize_result_json(result));
        logger->info("C API: inference_generate -> {}",
                     result.ok() ? "ok" : "error");
        return result.ok() ? ENTROPIC_OK : result.error_code;
    } catch (const std::exception& e) {
        logger->error("inference_generate exception: {}", e.what());
        return ENTROPIC_ERROR_GENERATE_FAILED;
    }
}


namespace {


/// Adapts a caller-owned C `int` cancel flag to the std::atomic<bool>
/// that the backend's generate() takes.
///
/// When constructed with a non-null flag, a helper thread checks the int
/// roughly every 10 ms and latches the atomic to true once it sees a
/// non-zero value. The destructor stops and joins that thread, so the
/// caller's flag only has to outlive this object. With a null flag no
/// thread is started and flag() stays false.
///
/// NOTE(review): the caller's int is read from the helper thread without
/// any synchronization; confirm that callers writing it from another
/// thread find this acceptable.
class CancelFlagBridge {
public:
    explicit CancelFlagBridge(int* cancel_flag) {
        if (cancel_flag != nullptr) {
            worker_ = std::thread(&CancelFlagBridge::watch, this, cancel_flag);
        }
    }

    ~CancelFlagBridge() {
        stop_.store(true, std::memory_order_release);
        if (worker_.joinable()) {
            worker_.join();
        }
    }

    // The helper thread holds `this`, so the object must stay put.
    CancelFlagBridge(const CancelFlagBridge&) = delete;
    CancelFlagBridge& operator=(const CancelFlagBridge&) = delete;

    /// Atomic that becomes true once the caller's flag was seen non-zero.
    std::atomic<bool>& flag() { return cancelled_; }

private:
    /// Helper-thread body: poll until told to stop or the flag is raised.
    void watch(const int* cancel_flag) {
        constexpr std::chrono::milliseconds kPollInterval{10};
        for (;;) {
            if (stop_.load(std::memory_order_acquire)) {
                return;
            }
            if (*cancel_flag != 0) {
                cancelled_.store(true, std::memory_order_release);
                return;
            }
            std::this_thread::sleep_for(kPollInterval);
        }
    }

    std::atomic<bool> cancelled_{false};  ///< Latched cancel request.
    std::atomic<bool> stop_{false};       ///< Tells the helper to exit.
    std::thread worker_;                  ///< Helper thread, if started.
};


}  // namespace


/**
 * @brief Plugin C API: blocking generation that can be cancelled through
 *        a caller-owned int flag.
 *
 * @param backend       Opaque backend handle; must not be null.
 * @param messages_json NUL-terminated messages JSON; must not be null.
 * @param params_json   NUL-terminated generation-params JSON; must not
 *                      be null.
 * @param result_json   Out: malloc'd JSON string describing the result
 *                      (release with entropic_inference_free). Set to
 *                      nullptr when no string was produced.
 * @param cancel_flag   Optional. While the call runs, a non-zero value
 *                      is forwarded to the backend as a cancel request
 *                      (polled about every 10 ms). Null disables
 *                      cancellation.
 * @return ENTROPIC_OK on success; ENTROPIC_ERROR_INVALID_ARGUMENT when a
 *         required argument is null; the result's error_code when
 *         generation reports failure; ENTROPIC_ERROR_GENERATE_FAILED
 *         when an exception is caught.
 */
ENTROPIC_EXPORT entropic_error_t entropic_inference_generate_with_cancel(
    entropic_inference_backend_t backend,
    const char* messages_json,
    const char* params_json,
    char** result_json,
    int* cancel_flag)
{
    if (!backend || !result_json) {
        return ENTROPIC_ERROR_INVALID_ARGUMENT;
    }
    // Give the out-parameter a defined value on every exit path so a
    // caller that frees it after a failure never frees garbage.
    *result_json = nullptr;
    // Null JSON text must not reach the parsers.
    if (!messages_json || !params_json) {
        return ENTROPIC_ERROR_INVALID_ARGUMENT;
    }
    logger->info("C API: inference_generate_with_cancel");
    try {
        auto msgs = entropic::parse_messages_json(messages_json);
        auto params = parse_params_json(params_json);
        // Lives for the duration of generate(); its destructor joins the
        // polling thread before cancel_flag can go out of the caller's
        // hands.
        CancelFlagBridge bridge(cancel_flag);
        auto result = to_backend(backend)->generate(
            msgs, params, bridge.flag());
        *result_json = alloc_string(serialize_result_json(result));
        logger->info("C API: inference_generate_with_cancel -> {}",
                     result.ok() ? "ok" : "error");
        return result.ok() ? ENTROPIC_OK : result.error_code;
    } catch (const std::exception& e) {
        logger->error("inference_generate_with_cancel exception: {}", e.what());
        return ENTROPIC_ERROR_GENERATE_FAILED;
    }
}


/**
 * @brief Plugin C API: streaming generation with a token callback and an
 *        optional cancel flag.
 *
 * @param backend       Opaque backend handle; must not be null.
 * @param messages_json NUL-terminated messages JSON; must not be null.
 * @param params_json   NUL-terminated generation-params JSON; must not
 *                      be null.
 * @param on_token      Called for each streamed chunk with a pointer and
 *                      length (no NUL terminator is promised) plus
 *                      @p user_data; must not be null.
 * @param user_data     Passed through to @p on_token unchanged.
 * @param cancel_flag   Optional. Inspected after each @p on_token call;
 *                      a non-zero value requests cancellation.
 * @return ENTROPIC_OK on success; ENTROPIC_ERROR_INVALID_ARGUMENT when a
 *         required argument is null; the result's error_code when
 *         generation reports failure; ENTROPIC_ERROR_GENERATE_FAILED
 *         when an exception is caught.
 */
ENTROPIC_EXPORT entropic_error_t entropic_inference_generate_streaming(
    entropic_inference_backend_t backend,
    const char* messages_json,
    const char* params_json,
    void (*on_token)(const char* token, size_t len, void* user_data),
    void* user_data,
    int* cancel_flag)
{
    // Null JSON text must not reach the parsers, so reject it together
    // with the other required arguments.
    if (!backend || !on_token || !messages_json || !params_json) {
        return ENTROPIC_ERROR_INVALID_ARGUMENT;
    }
    try {
        auto msgs = entropic::parse_messages_json(messages_json);
        auto params = parse_params_json(params_json);
        std::atomic<bool> cancel{false};

        // Forward each chunk to the C callback, then mirror the caller's
        // int flag into the atomic the backend is given.
        auto callback = [on_token, user_data, cancel_flag, &cancel]
            (std::string_view token) {
                on_token(token.data(), token.size(), user_data);
                if (cancel_flag && *cancel_flag) {
                    cancel.store(true, std::memory_order_release);
                }
            };

        auto result = to_backend(backend)->generate_streaming(
            msgs, params, callback, cancel);
        return result.ok() ? ENTROPIC_OK : result.error_code;
    } catch (const std::exception& e) {
        logger->error("inference_generate_streaming exception: {}", e.what());
        return ENTROPIC_ERROR_GENERATE_FAILED;
    }
}


/**
 * @brief Plugin C API: raw text completion of @p prompt.
 *
 * @param backend     Opaque backend handle; must not be null.
 * @param prompt      NUL-terminated prompt text; must not be null.
 * @param params_json NUL-terminated generation-params JSON; must not be
 *                    null.
 * @param result_json Out: malloc'd JSON string describing the result
 *                    (release with entropic_inference_free). Set to
 *                    nullptr when no string was produced.
 * @return ENTROPIC_OK on success; ENTROPIC_ERROR_INVALID_ARGUMENT when
 *         an argument is null; the result's error_code when completion
 *         reports failure; ENTROPIC_ERROR_GENERATE_FAILED when an
 *         exception is caught.
 */
ENTROPIC_EXPORT entropic_error_t entropic_inference_complete(
    entropic_inference_backend_t backend,
    const char* prompt,
    const char* params_json,
    char** result_json)
{
    if (!backend || !prompt || !result_json) {
        return ENTROPIC_ERROR_INVALID_ARGUMENT;
    }
    // Give the out-parameter a defined value on every exit path so a
    // caller that frees it after a failure never frees garbage.
    *result_json = nullptr;
    // Null JSON text must not reach the parser.
    if (!params_json) {
        return ENTROPIC_ERROR_INVALID_ARGUMENT;
    }
    try {
        auto params = parse_params_json(params_json);
        auto result = to_backend(backend)->complete(prompt, params);
        *result_json = alloc_string(serialize_result_json(result));
        return result.ok() ? ENTROPIC_OK : result.error_code;
    } catch (const std::exception& e) {
        logger->error("inference_complete exception: {}", e.what());
        return ENTROPIC_ERROR_GENERATE_FAILED;
    }
}


/**
 * @brief Plugin C API: count the tokens in a text span.
 *
 * @param backend  Opaque backend handle.
 * @param text     Pointer to @p text_len bytes of text.
 * @param text_len Number of bytes to read from @p text.
 * @return The backend's token count; 0 when @p backend or @p text is
 *         null; a rough text_len / 4 estimate (clamped to INT_MAX) when
 *         counting throws.
 */
ENTROPIC_EXPORT int entropic_inference_count_tokens(
    entropic_inference_backend_t backend,
    const char* text,
    size_t text_len)
{
    // v2.3.10: null guards. text=nullptr is a real misuse (caller
    // owes us bytes). Returning 0 keeps the contract simple (no
    // tokens to count when there's no string).
    if (!backend || !text) { return 0; }
    try {
        return to_backend(backend)->count_tokens(std::string(text, text_len));
    } catch (...) {
        // Divide while still in size_t and clamp before narrowing:
        // casting text_len to int first gives a wrong (possibly
        // negative) estimate once text_len exceeds INT_MAX.
        constexpr size_t kMaxInt =
            static_cast<size_t>(std::numeric_limits<int>::max());
        const size_t estimate = text_len / 4;
        return static_cast<int>(estimate < kMaxInt ? estimate : kMaxInt);
    }
}


/**
 * @brief Plugin C API: destroy a backend created by
 *        entropic_create_inference_backend.
 *
 * @param backend Opaque backend handle; null is accepted and ignored.
 */
ENTROPIC_EXPORT void entropic_inference_destroy(
    entropic_inference_backend_t backend)
{
    entropic::InferenceBackend* impl = to_backend(backend);
    if (impl == nullptr) {
        return;  // nothing to release
    }
    delete impl;
}


/**
 * @brief Plugin C API: release a buffer returned through this API
 *        (e.g. a result_json string).
 *
 * @param ptr Buffer obtained from malloc by this library; null is a no-op.
 */
ENTROPIC_EXPORT void entropic_inference_free(void* ptr) {
    if (ptr == nullptr) {
        return;
    }
    std::free(ptr);
}


/**
 * @brief Factory: create a llama.cpp-based inference backend.
 *
 * @return An opaque handle to release with entropic_inference_destroy,
 *         or nullptr when construction fails.
 */
ENTROPIC_EXPORT entropic_inference_backend_t entropic_create_inference_backend() {
    try {
        // Convert to the base pointer BEFORE erasing the type. Every
        // other entry point reinterprets the handle as
        // entropic::InferenceBackend*, which only round-trips correctly
        // if the handle was produced from that exact pointer type.
        entropic::InferenceBackend* base = new entropic::LlamaCppBackend();
        return reinterpret_cast<entropic_inference_backend_t>(base);
    } catch (const std::exception& e) {
        // Exceptions must not cross the extern "C" boundary.
        logger->error("create_inference_backend exception: {}", e.what());
    } catch (...) {
        logger->error("create_inference_backend: unknown exception");
    }
    return nullptr;
}


/**
 * @brief Plugin API version implemented by this file.
 * @return The constant 1.
 */
ENTROPIC_EXPORT int entropic_plugin_api_version() {
    constexpr int kPluginApiVersion = 1;
    return kPluginApiVersion;
}


// ── Log redirect (v2.0.1) ──────────────────────────────────


// Stream that ggml_log_to_file writes to; nullptr while no redirect is
// active.
static FILE* s_ggml_log_fp = nullptr;

// llama.cpp's llama_log_set is a single-slot process global. Track
// the active path so a second handle in the same process gets a
// predictable answer. Intended policy: same path → no-op (don't
// truncate the first handle's live log), conflicting path → reject
// with a warning (rather than clobber).
//
// s_ggml_log_mu is taken by entropic_inference_log_to_file and
// entropic_inference_log_silence around every change to the redirect
// state.
static std::mutex s_ggml_log_mu;

// Canonicalized path of the active redirect; empty when none is active.
static std::optional<std::string> s_ggml_log_path;


static void ggml_log_to_file(enum ggml_log_level /*level*/,

                             const char* text, void* /*ud*/) {

    if (s_ggml_log_fp && text) {

        fputs(text, s_ggml_log_fp);

        fflush(s_ggml_log_fp);

    }

}


// Log callback that discards every message. ggml_log_silence_locked
// installs it through llama_log_set to silence llama/ggml output.
static void ggml_log_noop(enum ggml_log_level /*level*/,
                          const char* /*text*/, void* /*ud*/) {
}


/// Tear down any active redirect: close the log file if one is open,
/// forget the tracked path, and install the discard-everything callback.
/// The callers in this file hold s_ggml_log_mu while calling this.
static void ggml_log_silence_locked() {
    if (FILE* const open_fp = s_ggml_log_fp; open_fp != nullptr) {
        fclose(open_fp);
        s_ggml_log_fp = nullptr;
    }
    s_ggml_log_path = std::nullopt;
    llama_log_set(ggml_log_noop, nullptr);
}


/// Resolve @p path with std::filesystem::weakly_canonical; if that
/// reports an error, return @p path unchanged.
static std::string canonicalize_or_passthrough(const char* path) {
    std::error_code failure;
    const std::filesystem::path resolved =
        std::filesystem::weakly_canonical(path, failure);
    if (failure) {
        return std::string(path);
    }
    return resolved.string();
}


/**
 * @brief Redirect llama/ggml log output to a file, or silence it.
 *
 * llama_log_set has a single process-wide slot, so the first redirect
 * wins:
 *  - null or empty @p path: close any active redirect and discard logs;
 *  - no redirect active: open @p path for writing (truncating it) and
 *    route logs there;
 *  - redirect active on the same canonical path: no-op, so the live log
 *    is not truncated;
 *  - redirect active on a different path: log a warning and ignore the
 *    request.
 *
 * If the file cannot be opened, a warning is logged and the previous
 * state is kept.
 *
 * @param path Destination file path, or null/empty to silence.
 */
void entropic_inference_log_to_file(const char* path) {
    std::lock_guard lk(s_ggml_log_mu);

    if (!path || path[0] == '\0') {
        ggml_log_silence_locked();
        return;
    }
    auto canonical = canonicalize_or_passthrough(path);

    // llama_log_set has one process-global slot; first-call wins so a
    // second handle's redirect cannot clobber the first.
    if (s_ggml_log_path) {
        if (*s_ggml_log_path != canonical) {
            logger->warn(
                "ggml log redirect already wired to {}; ignoring request for {}",
                *s_ggml_log_path, canonical);
        }
        // Same path: already wired. Re-opening with "w" would truncate
        // the log the first handle is still writing to.
        return;
    }

    FILE* fp = fopen(path, "w");
    if (!fp) {
        logger->warn("ggml log fopen failed for {}: {}",
                     path, std::strerror(errno));
        return;
    }
    // Defensive: no stream should be open while no path is tracked, but
    // never leak one if that invariant is ever broken.
    if (s_ggml_log_fp) { fclose(s_ggml_log_fp); }
    s_ggml_log_fp = fp;
    s_ggml_log_path = canonical;
    llama_log_set(ggml_log_to_file, nullptr);
}


/**
 * @brief Silence llama/ggml log output and drop any active file redirect.
 *
 * Takes s_ggml_log_mu, then delegates to ggml_log_silence_locked.
 */
void entropic_inference_log_silence(void) {
    const std::lock_guard<std::mutex> guard(s_ggml_log_mu);
    ggml_log_silence_locked();
}


} // extern "C"

entropic::InferenceBackend
Concrete base class for inference backends (80% logic).
Definition backend.h:69

entropic::LlamaCppBackend
LlamaCppBackend — common llama.cpp patterns (15% layer).
Definition llama_cpp_backend.h:65

entropic_export.h
Symbol visibility macro for all exported symbols.

ENTROPIC_MODEL_STATE_COLD
@ ENTROPIC_MODEL_STATE_COLD
On disk only, no RAM consumed.
Definition enums.h:28

entropic_error_t
entropic_error_t
Error codes returned by all C API functions.
Definition error.h:35

ENTROPIC_OK
@ ENTROPIC_OK
Success.
Definition error.h:36

ENTROPIC_ERROR_INTERNAL
@ ENTROPIC_ERROR_INTERNAL
Unexpected internal error (bug)
Definition error.h:51

ENTROPIC_ERROR_INVALID_ARGUMENT
@ ENTROPIC_ERROR_INVALID_ARGUMENT
NULL pointer, empty string, out-of-range value.
Definition error.h:37

ENTROPIC_ERROR_GENERATE_FAILED
@ ENTROPIC_ERROR_GENERATE_FAILED
Generation failed (context overflow, model error)
Definition error.h:42

ENTROPIC_ERROR_LOAD_FAILED
@ ENTROPIC_ERROR_LOAD_FAILED
Model load failed (corrupt file, OOM, unsupported format)
Definition error.h:41

i_inference_backend.h
Pure C interface contract for inference backends.

entropic_inference_backend_t
struct entropic_inference_backend * entropic_inference_backend_t
Opaque handle to an inference backend instance.
Definition i_inference_backend.h:42

entropic_inference_complete
ENTROPIC_EXPORT entropic_error_t entropic_inference_complete(entropic_inference_backend_t backend, const char *prompt, const char *params_json, char **result_json)
Plugin C API: raw text completion without chat template.
Definition inference_c_api.cpp:519

ggml_log_to_file
static void ggml_log_to_file(enum ggml_log_level, const char *text, void *)
Callback that writes to the ggml log file.
Definition inference_c_api.cpp:623

entropic_inference_generate_streaming
ENTROPIC_EXPORT entropic_error_t entropic_inference_generate_streaming(entropic_inference_backend_t backend, const char *messages_json, const char *params_json, void(*on_token)(const char *token, size_t len, void *user_data), void *user_data, int *cancel_flag)
Plugin C API: streaming generation with token callback and cancel flag.
Definition inference_c_api.cpp:476

entropic_inference_generate
ENTROPIC_EXPORT entropic_error_t entropic_inference_generate(entropic_inference_backend_t backend, const char *messages_json, const char *params_json, char **result_json)
Plugin C API: blocking generation returning full result.
Definition inference_c_api.cpp:317

entropic_plugin_api_version
ENTROPIC_EXPORT int entropic_plugin_api_version()
Plugin API version.
Definition inference_c_api.cpp:603

canonicalize_or_passthrough
static std::string canonicalize_or_passthrough(const char *path)
Resolve path via weakly_canonical, fall back to raw on error.
Definition inference_c_api.cpp:665

entropic_inference_unload
ENTROPIC_EXPORT entropic_error_t entropic_inference_unload(entropic_inference_backend_t backend)
Plugin C API: release the loaded model (transition to COLD).
Definition inference_c_api.cpp:277

entropic_inference_generate_with_cancel
ENTROPIC_EXPORT entropic_error_t entropic_inference_generate_with_cancel(entropic_inference_backend_t backend, const char *messages_json, const char *params_json, char **result_json, int *cancel_flag)
Plugin C API: batch generation with mid-decode cancel.
Definition inference_c_api.cpp:437

entropic_inference_log_silence
void entropic_inference_log_silence(void)
Silence all llama/ggml output.
Definition inference_c_api.cpp:717

entropic_inference_count_tokens
ENTROPIC_EXPORT int entropic_inference_count_tokens(entropic_inference_backend_t backend, const char *text, size_t text_len)
Plugin C API: count tokens for a text span (exact when loaded, estimate when COLD).
Definition inference_c_api.cpp:548

entropic_inference_deactivate
ENTROPIC_EXPORT entropic_error_t entropic_inference_deactivate(entropic_inference_backend_t backend)
Plugin C API: demote backend from ACTIVE to WARM (release GPU).
Definition inference_c_api.cpp:257

ggml_log_silence_locked
static void ggml_log_silence_locked()
Close the ggml log file (if open), clear the tracked path, and install the no-op log callback; the caller must hold s_ggml_log_mu.
Definition inference_c_api.cpp:651

entropic_inference_log_to_file
void entropic_inference_log_to_file(const char *path)
Redirect llama/ggml logs to a file or silence them.
Definition inference_c_api.cpp:682

entropic_inference_load
ENTROPIC_EXPORT entropic_error_t entropic_inference_load(entropic_inference_backend_t backend, const char *config_json)
Plugin C API: load a model into the inference backend.
Definition inference_c_api.cpp:207

entropic_create_inference_backend
ENTROPIC_EXPORT entropic_inference_backend_t entropic_create_inference_backend()
Factory: create inference backend instance.
Definition inference_c_api.cpp:592

ggml_log_noop
static void ggml_log_noop(enum ggml_log_level, const char *, void *)
No-op callback.
Definition inference_c_api.cpp:636

entropic_inference_free
ENTROPIC_EXPORT void entropic_inference_free(void *ptr)
Plugin C API: free memory allocated by the inference backend.
Definition inference_c_api.cpp:582

entropic_inference_destroy
ENTROPIC_EXPORT void entropic_inference_destroy(entropic_inference_backend_t backend)
Plugin C API: destroy the backend and free its resources.
Definition inference_c_api.cpp:570

entropic_inference_activate
ENTROPIC_EXPORT entropic_error_t entropic_inference_activate(entropic_inference_backend_t backend)
Plugin C API: promote backend from WARM to ACTIVE (GPU load).
Definition inference_c_api.cpp:238

entropic_inference_state
ENTROPIC_EXPORT int entropic_inference_state(entropic_inference_backend_t backend)
Plugin C API: query current lifecycle state (lock-free).
Definition inference_c_api.cpp:297

llama_cpp_backend.h
LlamaCppBackend — llama.cpp C API integration.

logging.h
spdlog initialization and logger access.

entropic::log::get
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
Definition logging.cpp:211

alloc_string
static char * alloc_string(const std::string &s)
Allocate a C string copy for caller-owned return.
Definition mcp_c_api.cpp:41

messages_json.h
Shared parser: messages-JSON wire format → vector<Message>.

entropic::parse_messages_json
std::vector< Message > parse_messages_json(const char *json_str)
Parse a JSON array of messages into a vector of Message.
Definition messages_json.cpp:68

entropic::GenerationParams
Generation parameters for a single inference call.
Definition config.h:302

entropic::GenerationParams::grammar
std::string grammar
GBNF grammar string (empty = unconstrained)
Definition config.h:359

entropic::GenerationParams::top_k
int top_k
Top-K sampling.
Definition config.h:305

entropic::GenerationParams::logit_bias
std::unordered_map< int32_t, float > logit_bias
Per-token logit bias map (gh#23 MVP item 4).
Definition config.h:336

entropic::GenerationParams::repeat_penalty
float repeat_penalty
Repetition penalty.
Definition config.h:306

entropic::GenerationParams::temperature
float temperature
Sampling temperature.
Definition config.h:303

entropic::GenerationParams::frequency_penalty
float frequency_penalty
Frequency-penalty term in llama.cpp's penalties sampler (gh#23 MVP item 3).
Definition config.h:349

entropic::GenerationParams::presence_penalty
float presence_penalty
Presence-penalty term in llama.cpp's penalties sampler (gh#23 MVP item 2).
Definition config.h:322

entropic::GenerationParams::min_p
float min_p
Min-p nucleus sampling threshold (gh#23 MVP item 1).
Definition config.h:315

entropic::GenerationParams::max_tokens
int max_tokens
Maximum tokens to generate.
Definition config.h:351

entropic::GenerationParams::top_p
float top_p
Nucleus sampling threshold.
Definition config.h:304

entropic::GenerationResult
Result of a single generation call.
Definition generation_result.h:30

entropic::GenerationResult::error_code
entropic_error_t error_code
Error code (ENTROPIC_OK if no error)
Definition generation_result.h:82

entropic::GenerationResult::generation_time_ms
double generation_time_ms
Wall-clock generation time.
Definition generation_result.h:36

entropic::GenerationResult::finish_reason
std::string finish_reason
Finish reason: "stop", "length", "error".
Definition generation_result.h:34

entropic::GenerationResult::content
std::string content
Generated text (cleaned by adapter)
Definition generation_result.h:31

entropic::GenerationResult::error_message
std::string error_message
Error description (empty if no error)
Definition generation_result.h:83

entropic::GenerationResult::token_count
int token_count
Generated token count.
Definition generation_result.h:35

entropic::ModelConfig
Model configuration for a single tier.
Definition config.h:148

entropic::ModelConfig::gpu_layers
int gpu_layers
GPU offload layers (-1 = all)
Definition config.h:152

entropic::ModelConfig::context_length
int context_length
Context window size (512–131072)
Definition config.h:151

entropic::ModelConfig::path
std::filesystem::path path
Resolved model file path.
Definition config.h:149

entropic::ModelConfig::n_threads
int n_threads
CPU threads (0 = auto-detect)
Definition config.h:174

entropic::ModelConfig::keep_warm
bool keep_warm
Pre-warm model at startup.
Definition config.h:153

entropic::ModelConfig::n_batch
int n_batch
Batch size for prompt processing.
Definition config.h:160

entropic::ModelConfig::flash_attn
bool flash_attn
Enable flash attention.
Definition config.h:233

entropic::ModelConfig::use_mlock
bool use_mlock
Lock model in system RAM.
Definition config.h:154

entropic::ModelConfig::adapter
std::string adapter
Chat adapter name.
Definition config.h:150