12namespace entropic::mcp {
/// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER (bytes EF BF BD).
constexpr char kReplacement[] = "\xEF\xBF\xBD";

/// Number of payload bytes in kReplacement. Derived from the literal itself
/// (minus the implicit terminating NUL) so the two can never drift apart.
constexpr size_t kReplacementLen = sizeof(kReplacement) - 1;
/// Tells whether a byte is a UTF-8 continuation byte.
///
/// @param b Byte to classify.
/// @return true when the top two bits of b are `10` (i.e. b is in 0x80..0xBF).
inline bool is_cont(uint8_t b) {
    // Mask keeps the two most significant bits; continuation bytes are 10xxxxxx.
    return (b & 0xC0) == 0x80;
}
// Classifies byte b as the first byte of a UTF-8 sequence and stores in n how
// many further bytes that sequence requires (0 to 3).
//
// NOTE(review): this excerpt is truncated. The declaration of n, the value n
// holds when no branch matches (b in 0x80..0xC1 or 0xF5..0xFF), the return
// statement and the closing brace are not visible here. The leading numbers
// (36, 38, ...) look like line numbers from a rendered listing that got fused
// into the text rather than code -- confirm against the real file.
36inline int follow_count(uint8_t b) {
// ASCII byte: a complete one-byte sequence, nothing follows.
38 if (b < 0x80) { n = 0; }
// Two-byte lead. The range starts at 0xC2, so 0xC0/0xC1 (which could only
// start overlong encodings) do not match.
39 else if (b >= 0xC2 && b < 0xE0) { n = 1; }
// Three-byte lead (0xE0..0xEF).
40 else if (b >= 0xE0 && b < 0xF0) { n = 2; }
// Four-byte lead. The range stops at 0xF4; 0xF5 and above would encode code
// points beyond U+10FFFF and do not match.
41 else if (b >= 0xF0 && b < 0xF5) { n = 3; }
// Checks the byte c1 that immediately follows lead byte `lead`: it must be a
// continuation byte, and for four specific lead values it must also fall in a
// narrower sub-range.
//
// NOTE(review): this excerpt is truncated. The return statement and closing
// brace are not visible; presumably `ok` is what gets returned -- confirm. The
// leading numbers (58, 59, ...) look like fused listing line numbers, not code.
58inline bool first_cont_in_range(uint8_t lead, uint8_t c1) {
// Baseline requirement for every lead byte: c1 is of the form 10xxxxxx.
59 bool ok = is_cont(c1);
// Lead 0xE0: c1 below 0xA0 would be an overlong three-byte encoding.
60 if (lead == 0xE0) {
ok =
ok && c1 >= 0xA0; }
// Lead 0xED: c1 above 0x9F would encode a UTF-16 surrogate (U+D800..U+DFFF).
61 else if (lead == 0xED) {
ok =
ok && c1 <= 0x9F; }
// Lead 0xF0: c1 below 0x90 would be an overlong four-byte encoding.
62 else if (lead == 0xF0) {
ok =
ok && c1 >= 0x90; }
// Lead 0xF4: c1 above 0x8F would encode a code point beyond U+10FFFF.
63 else if (lead == 0xF4) {
ok =
ok && c1 <= 0x8F; }
// Measures the UTF-8 sequence starting at p within the buffer [p, end).
// Yields n + 1 (lead byte plus n following bytes) when `ok` holds, otherwise 0.
//
// NOTE(review): this excerpt is truncated. The declaration of `ok`, the first
// part of its initializer, and the body of the for loop are not visible, so
// the exact acceptance condition cannot be confirmed from here. The leading
// numbers (76, 77, ...) look like fused listing line numbers, not code.
76size_t valid_seq_len(
const uint8_t* p,
const uint8_t* end) {
// Number of bytes expected after the lead byte *p.
77 int n = follow_count(*p);
// Tail of the initializer of `ok`: `p + n < end` requires the last of the n
// following bytes to lie inside the buffer before p[1] is read and checked
// against the lead-specific range.
80 || (p + n < end && first_cont_in_range(*p, p[1])));
// Visits following bytes 2..n, stopping as soon as `ok` becomes false.
// (Loop body not visible -- presumably it tests p[i]; confirm.)
81 for (
int i = 2;
ok && i <= n; ++i) {
// Total sequence length on success; 0 signals that no valid sequence was found.
84 return ok ?
static_cast<size_t>(n + 1) : 0;
// NOTE(review): fragment of a function body -- presumably sanitize_utf8, given
// the names `input` and `out`; confirm. The signature, the declaration of
// `out`, the surrounding loop/branches and the return are not visible in this
// excerpt, so how these statements are sequenced cannot be confirmed from
// here. The leading numbers (98, 99, ...) look like fused listing line
// numbers, not code.
// Pre-allocate capacity equal to the input's byte length.
98 out.reserve(input.size());
// View the input's bytes as unsigned so they can be passed to valid_seq_len,
// which takes const uint8_t*.
99 const auto* p =
reinterpret_cast<const uint8_t*
>(input.data());
// One-past-the-last input byte.
100 const auto* end = p + input.size();
// Length of the sequence at p as reported by valid_seq_len.
102 size_t len = valid_seq_len(p, end);
// Copy `len` bytes starting at p into the output unchanged.
104 out.append(
reinterpret_cast<const char*
>(p), len);
// Emit kReplacementLen bytes of kReplacement into the output.
107 out.append(kReplacement, kReplacementLen);
@ ok
Tool dispatched, returned non-empty content.
UTF-8 validation + replacement at every system boundary where bytes change ownership.
ENTROPIC_EXPORT std::string sanitize_utf8(std::string_view input)
Replace invalid UTF-8 byte sequences with U+FFFD.