Entropic 2.3.8
Local-first agentic inference engine
Loading...
Searching...
No Matches
inference_c_api.cpp
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
16
17#include "llama_cpp_backend.h"
18
19#include <nlohmann/json.hpp>
20
21#include <atomic>
22#include <cerrno>
23#include <cstdlib>
24#include <cstring>
25#include <filesystem>
26#include <mutex>
27#include <optional>
28#include <string>
29
30namespace {
31
32auto logger = entropic::log::get("inference.c_api");
33
42 return reinterpret_cast<entropic::InferenceBackend*>(h);
43}
44
/**
 * @brief Duplicate a std::string into a malloc-owned, NUL-terminated buffer.
 *
 * The buffer is obtained from std::malloc, so it must be released with
 * std::free (entropic_inference_free in this file wraps exactly that).
 *
 * @param s String to copy; all s.size() bytes plus the terminator are copied.
 * @return Pointer to the new buffer, or nullptr when std::malloc fails.
 */
char* alloc_string(const std::string& s) {
    const std::size_t bytes = s.size() + 1;  // payload + trailing NUL
    void* raw = std::malloc(bytes);
    if (raw == nullptr) {
        return nullptr;
    }
    // memcpy returns its destination, so the copy and the cast fold together.
    return static_cast<char*>(std::memcpy(raw, s.c_str(), bytes));
}
59
75template <typename T>
76static void set_if(const nlohmann::json& j, const char* key, T& out) {
77 if (j.contains(key)) { out = j[key].get<T>(); }
78}
79
85entropic::ModelConfig parse_config_json(const char* json_str) {
87 auto j = nlohmann::json::parse(json_str);
88
89 set_if(j, "path", config.path);
90 set_if(j, "adapter", config.adapter);
91 set_if(j, "context_length", config.context_length);
92 set_if(j, "gpu_layers", config.gpu_layers);
93 set_if(j, "keep_warm", config.keep_warm);
94 set_if(j, "use_mlock", config.use_mlock);
95 set_if(j, "n_batch", config.n_batch);
96 set_if(j, "n_threads", config.n_threads);
97 set_if(j, "flash_attn", config.flash_attn);
98
99 return config;
100}
101
109entropic::GenerationParams parse_params_json(const char* json_str) {
111 auto j = nlohmann::json::parse(json_str);
112
113 if (j.contains("temperature")) params.temperature = j["temperature"].get<float>();
114 if (j.contains("top_p")) params.top_p = j["top_p"].get<float>();
115 if (j.contains("top_k")) params.top_k = j["top_k"].get<int>();
116 if (j.contains("repeat_penalty")) params.repeat_penalty = j["repeat_penalty"].get<float>();
117 if (j.contains("max_tokens")) params.max_tokens = j["max_tokens"].get<int>();
118 if (j.contains("grammar")) params.grammar = j["grammar"].get<std::string>();
119
120 return params;
121}
122
130std::string serialize_result_json(const entropic::GenerationResult& result) {
131 nlohmann::json j;
132 j["content"] = result.content;
133 j["finish_reason"] = result.finish_reason;
134 j["token_count"] = result.token_count;
135 j["generation_time_ms"] = result.generation_time_ms;
136 j["error_code"] = static_cast<int>(result.error_code);
137 j["error_message"] = result.error_message;
138 return j.dump();
139}
140
141/* parse_content_part + parse_messages_json moved to the shared
142 * utility include/entropic/types/messages_json.h (v2.1.8, gh#37) so
143 * the facade's entropic_run_messages can reuse them. Calls below
144 * dispatch to entropic::parse_messages_json directly. */
145
146} // anonymous namespace
147
148// ── C API Implementation ───────────────────────────────────
149
150extern "C" {
151
162 const char* config_json)
163{
164 logger->info("C API: inference_load");
165 try {
166 auto config = parse_config_json(config_json);
167 auto rc = to_backend(backend)->load(config)
169 logger->info("C API: inference_load -> {}",
170 static_cast<int>(rc));
171 return rc;
172 } catch (const std::exception& e) {
173 logger->error("inference_load exception: {}", e.what());
175 }
176}
177
187{
188 try {
189 return to_backend(backend)->activate() ? ENTROPIC_OK : ENTROPIC_ERROR_LOAD_FAILED;
190 } catch (const std::exception& e) {
191 logger->error("inference_activate exception: {}", e.what());
193 }
194}
195
205{
206 try {
207 to_backend(backend)->deactivate();
208 return ENTROPIC_OK;
209 } catch (const std::exception& e) {
210 logger->error("inference_deactivate exception: {}", e.what());
212 }
213}
214
224{
225 try {
226 to_backend(backend)->unload();
227 return ENTROPIC_OK;
228 } catch (const std::exception& e) {
229 logger->error("inference_unload exception: {}", e.what());
231 }
232}
233
241ENTROPIC_EXPORT int entropic_inference_state(
243{
244 return static_cast<int>(to_backend(backend)->state());
245}
246
259 const char* messages_json,
260 const char* params_json,
261 char** result_json)
262{
263 logger->info("C API: inference_generate");
264 try {
265 auto msgs = entropic::parse_messages_json(messages_json);
266 auto params = parse_params_json(params_json);
267 auto result = to_backend(backend)->generate(msgs, params);
268 *result_json = alloc_string(serialize_result_json(result));
269 logger->info("C API: inference_generate -> {}",
270 result.ok() ? "ok" : "error");
271 return result.ok() ? ENTROPIC_OK : result.error_code;
272 } catch (const std::exception& e) {
273 logger->error("inference_generate exception: {}", e.what());
275 }
276}
277
292 const char* messages_json,
293 const char* params_json,
294 void (*on_token)(const char* token, size_t len, void* user_data),
295 void* user_data,
296 int* cancel_flag)
297{
298 try {
299 auto msgs = entropic::parse_messages_json(messages_json);
300 auto params = parse_params_json(params_json);
301 std::atomic<bool> cancel{false};
302
303 auto callback = [on_token, user_data, cancel_flag, &cancel]
304 (std::string_view token) {
305 on_token(token.data(), token.size(), user_data);
306 if (cancel_flag && *cancel_flag) {
307 cancel.store(true, std::memory_order_release);
308 }
309 };
310
311 auto result = to_backend(backend)->generate_streaming(
312 msgs, params, callback, cancel);
313 return result.ok() ? ENTROPIC_OK : result.error_code;
314 } catch (const std::exception& e) {
315 logger->error("inference_generate_streaming exception: {}", e.what());
317 }
318}
319
332 const char* prompt,
333 const char* params_json,
334 char** result_json)
335{
336 try {
337 auto params = parse_params_json(params_json);
338 auto result = to_backend(backend)->complete(prompt, params);
339 *result_json = alloc_string(serialize_result_json(result));
340 return result.ok() ? ENTROPIC_OK : result.error_code;
341 } catch (const std::exception& e) {
342 logger->error("inference_complete exception: {}", e.what());
344 }
345}
346
358 const char* text,
359 size_t text_len)
360{
361 try {
362 return to_backend(backend)->count_tokens(std::string(text, text_len));
363 } catch (...) {
364 return static_cast<int>(text_len) / 4;
365 }
366}
367
374ENTROPIC_EXPORT void entropic_inference_destroy(
376{
377 delete to_backend(backend);
378}
379
386ENTROPIC_EXPORT void entropic_inference_free(void* ptr) {
387 std::free(ptr);
388}
389
400
407ENTROPIC_EXPORT int entropic_plugin_api_version() {
408 return 1;
409}
410
411// ── Log redirect (v2.0.1) ──────────────────────────────────
412
// Destination stream for redirected llama/ggml log text; nullptr while no
// redirect file is open.
static FILE* s_ggml_log_fp = nullptr;

// llama_log_set() installs one callback for the whole process, so every
// handle in the process shares a single redirect. The active path is
// remembered so that a later request gets a predictable outcome:
//   - same path        -> treated as a no-op, so the first handle's live
//                         log is not truncated;
//   - different path   -> rejected with a warning instead of clobbering
//                         the existing redirect.
// The mutex is taken by the redirect entry points while they update this
// state.
static std::mutex s_ggml_log_mu;
static std::optional<std::string> s_ggml_log_path;
421
427static void ggml_log_to_file(enum ggml_log_level /*level*/,
428 const char* text, void* /*ud*/) {
429 if (s_ggml_log_fp && text) {
430 fputs(text, s_ggml_log_fp);
431 fflush(s_ggml_log_fp);
432 }
433}
434
440static void ggml_log_noop(enum ggml_log_level /*level*/,
441 const char* /*text*/, void* /*ud*/) {
442}
443
456 if (s_ggml_log_fp) {
457 fclose(s_ggml_log_fp);
458 s_ggml_log_fp = nullptr;
459 }
460 s_ggml_log_path.reset();
461 llama_log_set(ggml_log_noop, nullptr);
462}
463
/**
 * @brief Resolve @p path with std::filesystem::weakly_canonical, falling
 *        back to the raw input when resolution reports an error.
 *
 * The non-throwing overload is used, so filesystem errors surface through
 * the error_code and select the fallback instead of raising
 * filesystem_error.
 *
 * @param path NUL-terminated path, or nullptr.
 * @return The canonical form on success; the unmodified @p path text when
 *         weakly_canonical sets an error; an empty string when @p path is
 *         nullptr (constructing a path or string from nullptr would be
 *         undefined behaviour).
 */
static std::string canonicalize_or_passthrough(const char* path) {
    if (path == nullptr) {
        return {};
    }
    std::error_code ec;
    const std::filesystem::path resolved =
        std::filesystem::weakly_canonical(path, ec);
    if (ec) {
        return std::string(path);  // resolution failed: keep caller's spelling
    }
    return resolved.string();
}
474
486void entropic_inference_log_to_file(const char* path) {
487 std::lock_guard lk(s_ggml_log_mu);
488
489 if (!path || path[0] == '\0') {
491 return;
492 }
493 auto canonical = canonicalize_or_passthrough(path);
494
495 // llama_log_set has one process-global slot; first-call wins so a
496 // second handle's redirect cannot clobber the first.
497 if (s_ggml_log_path && *s_ggml_log_path != canonical) {
498 logger->warn(
499 "ggml log redirect already wired to {}; ignoring request for {}",
500 *s_ggml_log_path, canonical);
501 return;
502 }
503
504 FILE* fp = fopen(path, "w");
505 if (!fp) {
506 logger->warn("ggml log fopen failed for {}: {}",
507 path, std::strerror(errno));
508 return;
509 }
510 if (s_ggml_log_fp) { fclose(s_ggml_log_fp); }
511 s_ggml_log_fp = fp;
512 s_ggml_log_path = canonical;
513 llama_log_set(ggml_log_to_file, nullptr);
514}
515
522 std::lock_guard lk(s_ggml_log_mu);
524}
525
526} // extern "C"
Concrete base class for inference backends (80% logic).
Definition backend.h:69
LlamaCppBackend — common llama.cpp patterns (15% layer).
Symbol visibility macro for all exported symbols.
entropic_error_t
Error codes returned by all C API functions.
Definition error.h:35
@ ENTROPIC_OK
Success.
Definition error.h:36
@ ENTROPIC_ERROR_INTERNAL
Unexpected internal error (bug)
Definition error.h:51
@ ENTROPIC_ERROR_GENERATE_FAILED
Generation failed (context overflow, model error)
Definition error.h:42
@ ENTROPIC_ERROR_LOAD_FAILED
Model load failed (corrupt file, OOM, unsupported format)
Definition error.h:41
Pure C interface contract for inference backends.
struct entropic_inference_backend * entropic_inference_backend_t
Opaque handle to an inference backend instance.
ENTROPIC_EXPORT entropic_error_t entropic_inference_complete(entropic_inference_backend_t backend, const char *prompt, const char *params_json, char **result_json)
Plugin C API: raw text completion without chat template.
static void ggml_log_to_file(enum ggml_log_level, const char *text, void *)
Callback that writes to the ggml log file.
ENTROPIC_EXPORT entropic_error_t entropic_inference_generate_streaming(entropic_inference_backend_t backend, const char *messages_json, const char *params_json, void(*on_token)(const char *token, size_t len, void *user_data), void *user_data, int *cancel_flag)
Plugin C API: streaming generation with token callback and cancel flag.
ENTROPIC_EXPORT entropic_error_t entropic_inference_generate(entropic_inference_backend_t backend, const char *messages_json, const char *params_json, char **result_json)
Plugin C API: blocking generation returning full result.
ENTROPIC_EXPORT int entropic_plugin_api_version()
Plugin API version.
static std::string canonicalize_or_passthrough(const char *path)
Resolve path via weakly_canonical, fall back to raw on error.
ENTROPIC_EXPORT entropic_error_t entropic_inference_unload(entropic_inference_backend_t backend)
Plugin C API: release the loaded model (transition to COLD).
void entropic_inference_log_silence(void)
Silence all llama/ggml output.
ENTROPIC_EXPORT int entropic_inference_count_tokens(entropic_inference_backend_t backend, const char *text, size_t text_len)
Plugin C API: count tokens for a text span (exact when loaded, estimate when COLD).
ENTROPIC_EXPORT entropic_error_t entropic_inference_deactivate(entropic_inference_backend_t backend)
Plugin C API: demote backend from ACTIVE to WARM (release GPU).
static void ggml_log_silence_locked()
Close the ggml log file if one is open, clear the tracked log path, and install the no-op llama log callback.
void entropic_inference_log_to_file(const char *path)
Redirect llama/ggml logs to a file or silence them.
ENTROPIC_EXPORT entropic_error_t entropic_inference_load(entropic_inference_backend_t backend, const char *config_json)
Plugin C API: load a model into the inference backend.
ENTROPIC_EXPORT entropic_inference_backend_t entropic_create_inference_backend()
Factory: create inference backend instance.
static void ggml_log_noop(enum ggml_log_level, const char *, void *)
No-op callback.
ENTROPIC_EXPORT void entropic_inference_free(void *ptr)
Plugin C API: free memory allocated by the inference backend.
ENTROPIC_EXPORT void entropic_inference_destroy(entropic_inference_backend_t backend)
Plugin C API: destroy the backend and free its resources.
ENTROPIC_EXPORT entropic_error_t entropic_inference_activate(entropic_inference_backend_t backend)
Plugin C API: promote backend from WARM to ACTIVE (GPU load).
ENTROPIC_EXPORT int entropic_inference_state(entropic_inference_backend_t backend)
Plugin C API: query current lifecycle state (lock-free).
LlamaCppBackend — llama.cpp C API integration.
spdlog initialization and logger access.
ENTROPIC_EXPORT std::shared_ptr< spdlog::logger > get(const std::string &name)
Get or create a named logger.
Definition logging.cpp:211
static char * alloc_string(const std::string &s)
Allocate a C string copy for caller-owned return.
Definition mcp_c_api.cpp:41
Shared parser: messages-JSON wire format → vector<Message>.
std::vector< Message > parse_messages_json(const char *json_str)
Parse a JSON array of messages into a vector of Message.
Generation parameters for a single inference call.
Definition config.h:227
std::string grammar
GBNF grammar string (empty = unconstrained)
Definition config.h:240
int top_k
Top-K sampling.
Definition config.h:230
float repeat_penalty
Repetition penalty.
Definition config.h:231
float temperature
Sampling temperature.
Definition config.h:228
int max_tokens
Maximum tokens to generate.
Definition config.h:232
float top_p
Nucleus sampling threshold.
Definition config.h:229
Result of a single generation call.
entropic_error_t error_code
Error code (ENTROPIC_OK if no error)
double generation_time_ms
Wall-clock generation time.
std::string finish_reason
Finish reason: "stop", "length", "error".
std::string content
Generated text (cleaned by adapter)
std::string error_message
Error description (empty if no error)
int token_count
Generated token count.
Model configuration for a single tier.
Definition config.h:148
int gpu_layers
GPU offload layers (-1 = all)
Definition config.h:152
int context_length
Context window size (512–131072)
Definition config.h:151
std::filesystem::path path
Resolved model file path.
Definition config.h:149
int n_threads
CPU threads (0 = auto-detect)
Definition config.h:161
bool keep_warm
Pre-warm model at startup.
Definition config.h:153
int n_batch
Batch size for prompt processing.
Definition config.h:160
bool flash_attn
Enable flash attention.
Definition config.h:163
bool use_mlock
Lock model in system RAM.
Definition config.h:154
std::string adapter
Chat adapter name.
Definition config.h:150