19#include <nlohmann/json.hpp>
53 char* buf =
static_cast<char*
>(std::malloc(s.size() + 1));
55 std::memcpy(buf, s.c_str(), s.size() + 1);
76static void set_if(
const nlohmann::json& j,
const char* key, T& out) {
77 if (j.contains(key)) { out = j[key].get<T>(); }
87 auto j = nlohmann::json::parse(json_str);
89 set_if(j,
"path", config.
path);
90 set_if(j,
"adapter", config.
adapter);
95 set_if(j,
"n_batch", config.
n_batch);
111 auto j = nlohmann::json::parse(json_str);
113 if (j.contains(
"temperature")) params.
temperature = j[
"temperature"].get<
float>();
114 if (j.contains(
"top_p")) params.
top_p = j[
"top_p"].get<
float>();
115 if (j.contains(
"top_k")) params.
top_k = j[
"top_k"].get<
int>();
116 if (j.contains(
"repeat_penalty")) params.
repeat_penalty = j[
"repeat_penalty"].get<
float>();
117 if (j.contains(
"max_tokens")) params.
max_tokens = j[
"max_tokens"].get<
int>();
118 if (j.contains(
"grammar")) params.
grammar = j[
"grammar"].get<std::string>();
136 j[
"error_code"] =
static_cast<int>(result.
error_code);
162 const char* config_json)
164 logger->info(
"C API: inference_load");
166 auto config = parse_config_json(config_json);
167 auto rc = to_backend(backend)->load(config)
169 logger->info(
"C API: inference_load -> {}",
170 static_cast<int>(rc));
172 }
catch (
const std::exception& e) {
173 logger->error(
"inference_load exception: {}", e.what());
190 }
catch (
const std::exception& e) {
191 logger->error(
"inference_activate exception: {}", e.what());
207 to_backend(backend)->deactivate();
209 }
catch (
const std::exception& e) {
210 logger->error(
"inference_deactivate exception: {}", e.what());
226 to_backend(backend)->unload();
228 }
catch (
const std::exception& e) {
229 logger->error(
"inference_unload exception: {}", e.what());
244 return static_cast<int>(to_backend(backend)->state());
259 const char* messages_json,
260 const char* params_json,
263 logger->info(
"C API: inference_generate");
266 auto params = parse_params_json(params_json);
267 auto result = to_backend(backend)->generate(msgs, params);
268 *result_json =
alloc_string(serialize_result_json(result));
269 logger->info(
"C API: inference_generate -> {}",
270 result.ok() ?
"ok" :
"error");
271 return result.ok() ?
ENTROPIC_OK : result.error_code;
272 }
catch (
const std::exception& e) {
273 logger->error(
"inference_generate exception: {}", e.what());
292 const char* messages_json,
293 const char* params_json,
294 void (*on_token)(
const char* token,
size_t len,
void* user_data),
300 auto params = parse_params_json(params_json);
301 std::atomic<bool> cancel{
false};
303 auto callback = [on_token, user_data, cancel_flag, &cancel]
304 (std::string_view token) {
305 on_token(token.data(), token.size(), user_data);
306 if (cancel_flag && *cancel_flag) {
307 cancel.store(
true, std::memory_order_release);
311 auto result = to_backend(backend)->generate_streaming(
312 msgs, params, callback, cancel);
313 return result.ok() ?
ENTROPIC_OK : result.error_code;
314 }
catch (
const std::exception& e) {
315 logger->error(
"inference_generate_streaming exception: {}", e.what());
333 const char* params_json,
337 auto params = parse_params_json(params_json);
338 auto result = to_backend(backend)->complete(prompt, params);
339 *result_json =
alloc_string(serialize_result_json(result));
340 return result.ok() ?
ENTROPIC_OK : result.error_code;
341 }
catch (
const std::exception& e) {
342 logger->error(
"inference_complete exception: {}", e.what());
362 return to_backend(backend)->count_tokens(std::string(text, text_len));
364 return static_cast<int>(text_len) / 4;
377 delete to_backend(backend);
413static FILE* s_ggml_log_fp =
nullptr;
419static std::mutex s_ggml_log_mu;
420static std::optional<std::string> s_ggml_log_path;
428 const char* text,
void* ) {
429 if (s_ggml_log_fp && text) {
430 fputs(text, s_ggml_log_fp);
431 fflush(s_ggml_log_fp);
441 const char* ,
void* ) {
457 fclose(s_ggml_log_fp);
458 s_ggml_log_fp =
nullptr;
460 s_ggml_log_path.reset();
471 auto canonical = std::filesystem::weakly_canonical(path, ec).string();
472 return ec ? std::string(path) : canonical;
487 std::lock_guard lk(s_ggml_log_mu);
489 if (!path || path[0] ==
'\0') {
497 if (s_ggml_log_path && *s_ggml_log_path != canonical) {
499 "ggml log redirect already wired to {}; ignoring request for {}",
500 *s_ggml_log_path, canonical);
504 FILE* fp = fopen(path,
"w");
506 logger->warn(
"ggml log fopen failed for {}: {}",
507 path, std::strerror(errno));
510 if (s_ggml_log_fp) { fclose(s_ggml_log_fp); }
512 s_ggml_log_path = canonical;
522 std::lock_guard lk(s_ggml_log_mu);
Concrete base class for inference backends (80% logic).
LlamaCppBackend — common llama.cpp patterns (15% layer).
Symbol visibility macro for all exported symbols.
entropic_error_t
Error codes returned by all C API functions.
@ ENTROPIC_ERROR_INTERNAL
Unexpected internal error (bug)
@ ENTROPIC_ERROR_GENERATE_FAILED
Generation failed (context overflow, model error)
@ ENTROPIC_ERROR_LOAD_FAILED
Model load failed (corrupt file, OOM, unsupported format)
Pure C interface contract for inference backends.
struct entropic_inference_backend * entropic_inference_backend_t
Opaque handle to an inference backend instance.
ENTROPIC_EXPORT entropic_error_t entropic_inference_complete(entropic_inference_backend_t backend, const char *prompt, const char *params_json, char **result_json)
Plugin C API: raw text completion without chat template.
static void ggml_log_to_file(enum ggml_log_level, const char *text, void *)
Callback that writes to the ggml log file.
ENTROPIC_EXPORT entropic_error_t entropic_inference_generate_streaming(entropic_inference_backend_t backend, const char *messages_json, const char *params_json, void(*on_token)(const char *token, size_t len, void *user_data), void *user_data, int *cancel_flag)
Plugin C API: streaming generation with token callback and cancel flag.
ENTROPIC_EXPORT entropic_error_t entropic_inference_generate(entropic_inference_backend_t backend, const char *messages_json, const char *params_json, char **result_json)
Plugin C API: blocking generation returning full result.
ENTROPIC_EXPORT int entropic_plugin_api_version()
Plugin API version.
static std::string canonicalize_or_passthrough(const char *path)
Resolve path via weakly_canonical, fall back to raw on error.
ENTROPIC_EXPORT entropic_error_t entropic_inference_unload(entropic_inference_backend_t backend)
Plugin C API: release the loaded model (transition to COLD).
void entropic_inference_log_silence(void)
Silence all llama/ggml output.
ENTROPIC_EXPORT int entropic_inference_count_tokens(entropic_inference_backend_t backend, const char *text, size_t text_len)
Plugin C API: count tokens for a text span (exact when loaded, estimate when COLD).
ENTROPIC_EXPORT entropic_error_t entropic_inference_deactivate(entropic_inference_backend_t backend)
Plugin C API: demote backend from ACTIVE to WARM (release GPU).
static void ggml_log_silence_locked()
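Silence llama/ggml logs: closes the ggml log file and clears the recorded log path (the `_locked` suffix suggests the caller must already hold s_ggml_log_mu — confirm).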
void entropic_inference_log_to_file(const char *path)
Redirect llama/ggml logs to a file or silence them.
ENTROPIC_EXPORT entropic_error_t entropic_inference_load(entropic_inference_backend_t backend, const char *config_json)
Plugin C API: load a model into the inference backend.
ENTROPIC_EXPORT entropic_inference_backend_t entropic_create_inference_backend()
Factory: create inference backend instance.
static void ggml_log_noop(enum ggml_log_level, const char *, void *)
No-op callback.
ENTROPIC_EXPORT void entropic_inference_free(void *ptr)
Plugin C API: free memory allocated by the inference backend.
ENTROPIC_EXPORT void entropic_inference_destroy(entropic_inference_backend_t backend)
Plugin C API: destroy the backend and free its resources.
ENTROPIC_EXPORT entropic_error_t entropic_inference_activate(entropic_inference_backend_t backend)
Plugin C API: promote backend from WARM to ACTIVE (GPU load).
ENTROPIC_EXPORT int entropic_inference_state(entropic_inference_backend_t backend)
Plugin C API: query current lifecycle state (lock-free).
LlamaCppBackend — llama.cpp C API integration.
spdlog initialization and logger access.
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
static char * alloc_string(const std::string &s)
Allocate a C string copy for caller-owned return.
Shared parser: messages-JSON wire format → vector<Message>.
std::vector< Message > parse_messages_json(const char *json_str)
Parse a JSON array of messages into a vector of Message.
Generation parameters for a single inference call.
std::string grammar
GBNF grammar string (empty = unconstrained)
float repeat_penalty
Repetition penalty.
float temperature
Sampling temperature.
int max_tokens
Maximum tokens to generate.
float top_p
Nucleus sampling threshold.
Result of a single generation call.
entropic_error_t error_code
Error code (ENTROPIC_OK if no error)
double generation_time_ms
Wall-clock generation time.
std::string finish_reason
Finish reason: "stop", "length", "error".
std::string content
Generated text (cleaned by adapter)
std::string error_message
Error description (empty if no error)
int token_count
Generated token count.
Model configuration for a single tier.
int gpu_layers
GPU offload layers (-1 = all)
int context_length
Context window size (512–131072)
std::filesystem::path path
Resolved model file path.
int n_threads
CPU threads (0 = auto-detect)
bool keep_warm
Pre-warm model at startup.
int n_batch
Batch size for prompt processing.
bool flash_attn
Enable flash attention.
bool use_mlock
Lock model in system RAM.
std::string adapter
Chat adapter name.