Entropic 2.3.8
Local-first agentic inference engine
Loading...
Searching...
No Matches
speculative_compat.cpp
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
15
16#include <llama.h>
17
18#include <algorithm>
19#include <cstring>
20#include <optional>
21#include <string>
22
23namespace entropic::speculative {
24
25namespace {
26
27// Mirrors llama.cpp's constants. Keep in sync with
28// extern/llama.cpp/common/speculative.cpp if upstream tightens.
29constexpr int kSpecVocabMaxSizeDifference = 128;  // largest |target - draft| token-count gap accepted by check_vocab_size
30constexpr int kSpecVocabCheckStartTokenId = 5;  // first token id compared by check_token_text; lower ids are not compared
31
56std::optional<std::string> check_arch_gate(
57 const llama_model* target) {
58 if (llama_model_is_recurrent(target)) {
59 return std::string{
60 "target model is recurrent (Mamba/RWKV) — speculative "
61 "decoding is incompatible with recurrent architectures "
62 "at the v2.1.11 llama.cpp pin"};
63 }
64 if (llama_model_is_hybrid(target)) {
65 return std::string{
66 "target model is hybrid (e.g., QWEN35/QWEN35MOE, "
67 "NEMOTRON_H, JAMBA, GRANITE_HYBRID) — speculative "
68 "decoding produces divergent state at split-prefill "
69 "boundaries on hybrid SSM architectures at the v2.1.11 "
70 "llama.cpp pin (see proposal Implementation Log, Gate A)"};
71 }
72 return std::nullopt;
73}
74
84std::optional<std::string> check_vocab_type(
85 const llama_vocab* vt, const llama_vocab* vd) {
86 if (llama_vocab_type(vt) != llama_vocab_type(vd)) {
87 return std::string{
88 "vocab type differs between target and draft models"};
89 }
90 return std::nullopt;
91}
92
102std::optional<std::string> check_bos(
103 const llama_vocab* vt, const llama_vocab* vd) {
104 const bool add_t = llama_vocab_get_add_bos(vt);
105 const bool add_d = llama_vocab_get_add_bos(vd);
106 if (add_t != add_d) {
107 return std::string{"BOS add-behavior differs"};
108 }
109 if (add_t && llama_vocab_bos(vt) != llama_vocab_bos(vd)) {
110 return std::string{"BOS token id differs"};
111 }
112 return std::nullopt;
113}
114
124std::optional<std::string> check_eos(
125 const llama_vocab* vt, const llama_vocab* vd) {
126 const bool add_t = llama_vocab_get_add_eos(vt);
127 const bool add_d = llama_vocab_get_add_eos(vd);
128 if (add_t != add_d) {
129 return std::string{"EOS add-behavior differs"};
130 }
131 if (add_t && llama_vocab_eos(vt) != llama_vocab_eos(vd)) {
132 return std::string{"EOS token id differs"};
133 }
134 return std::nullopt;
135}
136
147std::optional<std::string> check_vocab_size(
148 const llama_vocab* vt, const llama_vocab* vd) {
149 const int nt = llama_vocab_n_tokens(vt);
150 const int nd = llama_vocab_n_tokens(vd);
151 const int diff = (nt > nd) ? (nt - nd) : (nd - nt);
152 if (diff > kSpecVocabMaxSizeDifference) {
153 return std::string{
154 "vocab size difference exceeds the speculative tolerance "
155 "(target=" + std::to_string(nt)
156 + ", draft=" + std::to_string(nd)
157 + ", max-allowed-diff="
158 + std::to_string(kSpecVocabMaxSizeDifference) + ")"};
159 }
160 return std::nullopt;
161}
162
176std::optional<std::string> check_token_text(
177 const llama_vocab* vt, const llama_vocab* vd) {
178 const int nt = llama_vocab_n_tokens(vt);
179 const int nd = llama_vocab_n_tokens(vd);
180 const int end = std::min(nt, nd);
181 for (int i = kSpecVocabCheckStartTokenId; i < end; ++i) {
182 const char* tt = llama_vocab_get_text(vt, i);
183 const char* td = llama_vocab_get_text(vd, i);
184 if (tt == nullptr || td == nullptr || std::strcmp(tt, td) != 0) {
185 return std::string{
186 "token text differs at id "
187 + std::to_string(i)
188 + " — draft tokenizer is not a prefix-compatible "
189 "subset of the target"};
190 }
191 }
192 return std::nullopt;
193}
194
195} // anonymous namespace
196
197namespace {
198
213std::string run_vocab_checks(
214 const llama_vocab* vt, const llama_vocab* vd) {
215 using Check = std::optional<std::string> (*)(
216 const llama_vocab*, const llama_vocab*);
217 static constexpr Check checks[] = {
218 &check_vocab_type, &check_bos, &check_eos,
219 &check_vocab_size, &check_token_text,
220 };
221 std::string err;
222 for (Check fn : checks) {
223 if (err.empty()) {
224 auto d = fn(vt, vd);
225 if (d.has_value()) { err = std::move(*d); }
226 }
227 }
228 return err;
229}
230
243std::string build_compat_diagnostic(
244 const llama_model* target, const llama_model* draft) {
245 std::string err;
246 if (target == nullptr || draft == nullptr) {
247 err = "null model handle (target or draft)";
248 } else if (auto d = check_arch_gate(target); d.has_value()) {
249 err = std::move(*d);
250 } else {
251 const llama_vocab* vt = llama_model_get_vocab(target);
252 const llama_vocab* vd = llama_model_get_vocab(draft);
253 err = (vt == nullptr || vd == nullptr)
254 ? std::string{"model vocab unavailable (target or draft)"}
255 : run_vocab_checks(vt, vd);
256 }
257 return err;
258}
259
260} // anonymous namespace
261
272 const llama_model* target, const llama_model* draft) {
273 std::string err = build_compat_diagnostic(target, draft);
274 return CompatResult{err.empty(), std::move(err)};
275}
276
277} // namespace entropic::speculative
Tokenizer/architecture compatibility check for speculative decoding draft pairing.
CompatResult check_compat(const llama_model *target, const llama_model *draft)
Check whether a draft model can pair with a target for sequential speculative decoding.
Result of a draft/target compatibility check.