23namespace entropic::speculative {
/// Largest tolerated absolute difference between the target and draft
/// vocabulary token counts; check_vocab_size() rejects anything larger.
constexpr int kSpecVocabMaxSizeDifference = 128;

/// First token id compared by check_token_text(); ids below this value are
/// not compared.
constexpr int kSpecVocabCheckStartTokenId = 5;
56std::optional<std::string> check_arch_gate(
57 const llama_model* target) {
58 if (llama_model_is_recurrent(target)) {
60 "target model is recurrent (Mamba/RWKV) — speculative "
61 "decoding is incompatible with recurrent architectures "
62 "at the v2.1.11 llama.cpp pin"};
64 if (llama_model_is_hybrid(target)) {
66 "target model is hybrid (e.g., QWEN35/QWEN35MOE, "
67 "NEMOTRON_H, JAMBA, GRANITE_HYBRID) — speculative "
68 "decoding produces divergent state at split-prefill "
69 "boundaries on hybrid SSM architectures at the v2.1.11 "
70 "llama.cpp pin (see proposal Implementation Log, Gate A)"};
84std::optional<std::string> check_vocab_type(
85 const llama_vocab* vt,
const llama_vocab* vd) {
86 if (llama_vocab_type(vt) != llama_vocab_type(vd)) {
88 "vocab type differs between target and draft models"};
102std::optional<std::string> check_bos(
103 const llama_vocab* vt,
const llama_vocab* vd) {
104 const bool add_t = llama_vocab_get_add_bos(vt);
105 const bool add_d = llama_vocab_get_add_bos(vd);
106 if (add_t != add_d) {
107 return std::string{
"BOS add-behavior differs"};
109 if (add_t && llama_vocab_bos(vt) != llama_vocab_bos(vd)) {
110 return std::string{
"BOS token id differs"};
124std::optional<std::string> check_eos(
125 const llama_vocab* vt,
const llama_vocab* vd) {
126 const bool add_t = llama_vocab_get_add_eos(vt);
127 const bool add_d = llama_vocab_get_add_eos(vd);
128 if (add_t != add_d) {
129 return std::string{
"EOS add-behavior differs"};
131 if (add_t && llama_vocab_eos(vt) != llama_vocab_eos(vd)) {
132 return std::string{
"EOS token id differs"};
147std::optional<std::string> check_vocab_size(
148 const llama_vocab* vt,
const llama_vocab* vd) {
149 const int nt = llama_vocab_n_tokens(vt);
150 const int nd = llama_vocab_n_tokens(vd);
151 const int diff = (nt > nd) ? (nt - nd) : (nd - nt);
152 if (diff > kSpecVocabMaxSizeDifference) {
154 "vocab size difference exceeds the speculative tolerance "
155 "(target=" + std::to_string(nt)
156 +
", draft=" + std::to_string(nd)
157 +
", max-allowed-diff="
158 + std::to_string(kSpecVocabMaxSizeDifference) +
")"};
176std::optional<std::string> check_token_text(
177 const llama_vocab* vt,
const llama_vocab* vd) {
178 const int nt = llama_vocab_n_tokens(vt);
179 const int nd = llama_vocab_n_tokens(vd);
180 const int end = std::min(nt, nd);
181 for (
int i = kSpecVocabCheckStartTokenId; i < end; ++i) {
182 const char* tt = llama_vocab_get_text(vt, i);
183 const char* td = llama_vocab_get_text(vd, i);
184 if (tt ==
nullptr || td ==
nullptr || std::strcmp(tt, td) != 0) {
186 "token text differs at id "
188 +
" — draft tokenizer is not a prefix-compatible "
189 "subset of the target"};
213std::string run_vocab_checks(
214 const llama_vocab* vt,
const llama_vocab* vd) {
215 using Check = std::optional<std::string> (*)(
216 const llama_vocab*,
const llama_vocab*);
217 static constexpr Check checks[] = {
218 &check_vocab_type, &check_bos, &check_eos,
219 &check_vocab_size, &check_token_text,
222 for (Check fn : checks) {
225 if (d.has_value()) { err = std::move(*d); }
243std::string build_compat_diagnostic(
244 const llama_model* target,
const llama_model* draft) {
246 if (target ==
nullptr || draft ==
nullptr) {
247 err =
"null model handle (target or draft)";
248 }
else if (
auto d = check_arch_gate(target); d.has_value()) {
251 const llama_vocab* vt = llama_model_get_vocab(target);
252 const llama_vocab* vd = llama_model_get_vocab(draft);
253 err = (vt ==
nullptr || vd ==
nullptr)
254 ? std::string{
"model vocab unavailable (target or draft)"}
255 : run_vocab_checks(vt, vd);
272 const llama_model* target,
const llama_model* draft) {
273 std::string err = build_compat_diagnostic(target, draft);
Tokenizer/architecture compatibility check for speculative decoding draft pairing.
CompatResult check_compat(const llama_model *target, const llama_model *draft)
Check whether a draft model can pair with a target for sequential speculative decoding.
Result of a draft/target compatibility check.