Entropic 2.3.8
Local-first agentic inference engine
Loading...
Searching...
No Matches
utf8_sanitize.cpp
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
9
10#include <cstdint>
11
12namespace entropic::mcp {
13
namespace {

/// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER.
constexpr char kReplacement[] = "\xEF\xBF\xBD";

/// Byte length of kReplacement, excluding the terminating NUL. Derived from
/// the literal so the two can never drift apart.
constexpr size_t kReplacementLen = sizeof(kReplacement) - 1;

/**
 * @brief Test whether a byte is a UTF-8 continuation byte.
 * @param b Byte to test.
 * @return true when the top two bits are `10` (0x80..0xBF).
 */
inline bool is_cont(uint8_t b) { return (b & 0xC0) == 0x80; }

/**
 * @brief Number of continuation bytes that must follow a lead byte.
 *
 * 0xC0/0xC1 (always-overlong two-byte leads) and 0xF5..0xFF (leads that
 * would encode beyond U+10FFFF) are rejected, as are bare continuation
 * bytes 0x80..0xBF.
 *
 * @param b Candidate lead byte.
 * @return 0 for ASCII, 1..3 for a multi-byte lead, -1 when @p b cannot
 *         start a sequence.
 */
inline int follow_count(uint8_t b) {
    int n = -1;
    if (b < 0x80) { n = 0; }
    else if (b >= 0xC2 && b < 0xE0) { n = 1; }
    else if (b >= 0xE0 && b < 0xF0) { n = 2; }
    else if (b >= 0xF0 && b < 0xF5) { n = 3; }
    return n;
}

/**
 * @brief Validate the first continuation byte against its lead byte.
 *
 * Most leads accept the full continuation range 0x80..0xBF. Four leads
 * narrow it:
 *  - 0xE0: 0xA0..0xBF (rejects overlong three-byte forms)
 *  - 0xED: 0x80..0x9F (rejects UTF-16 surrogates U+D800..U+DFFF)
 *  - 0xF0: 0x90..0xBF (rejects overlong four-byte forms)
 *  - 0xF4: 0x80..0x8F (rejects code points above U+10FFFF)
 *
 * @param lead Lead byte of the sequence.
 * @param c1   Byte immediately following the lead.
 * @return true when @p c1 is an acceptable first continuation for @p lead.
 */
inline bool first_cont_in_range(uint8_t lead, uint8_t c1) {
    bool ok = is_cont(c1); // default range 0x80..0xBF
    if (lead == 0xE0) { ok = ok && c1 >= 0xA0; }
    else if (lead == 0xED) { ok = ok && c1 <= 0x9F; }
    else if (lead == 0xF0) { ok = ok && c1 >= 0x90; }
    else if (lead == 0xF4) { ok = ok && c1 <= 0x8F; }
    return ok;
}

/**
 * @brief Length of the well-formed UTF-8 sequence starting at @p p.
 *
 * @param p   Start of the candidate sequence.
 * @param end One past the last readable byte.
 * @return Sequence length in bytes (1..4), or 0 when the bytes at @p p do
 *         not form a complete, well-formed sequence inside [p, end), or
 *         when the range is empty.
 */
size_t valid_seq_len(const uint8_t* p, const uint8_t* end) {
    // An empty range has no lead byte to inspect; never dereference it.
    const int n = (p < end) ? follow_count(*p) : -1;
    // Compare against the number of remaining bytes instead of forming
    // `p + n`: a pointer more than one past the end of the buffer is
    // undefined behaviour, which a truncated trailing sequence would hit.
    bool ok = (n >= 0)
        && (n == 0
            || (n < end - p && first_cont_in_range(*p, p[1])));
    // Bytes after the first continuation only need the generic 10xxxxxx form.
    for (int i = 2; ok && i <= n; ++i) {
        ok = is_cont(p[i]);
    }
    return ok ? static_cast<size_t>(n + 1) : 0;
}

} // namespace
88
96std::string sanitize_utf8(std::string_view input) {
97 std::string out;
98 out.reserve(input.size());
99 const auto* p = reinterpret_cast<const uint8_t*>(input.data());
100 const auto* end = p + input.size();
101 while (p < end) {
102 size_t len = valid_seq_len(p, end);
103 if (len > 0) {
104 out.append(reinterpret_cast<const char*>(p), len);
105 p += len;
106 } else {
107 out.append(kReplacement, kReplacementLen);
108 ++p;
109 }
110 }
111 return out;
112}
113
114} // namespace entropic::mcp
@ ok
Tool dispatched, returned non-empty content.
UTF-8 validation + replacement at every system boundary where bytes change ownership.
ENTROPIC_EXPORT std::string sanitize_utf8(std::string_view input)
Replace invalid UTF-8 byte sequences with U+FFFD.