diff --git a/libc/str/isutf8.c b/libc/str/isutf8.c index 6c9a6334eaa..7dea499df04 100644 --- a/libc/str/isutf8.c +++ b/libc/str/isutf8.c @@ -27,8 +27,8 @@ static const char kUtf8Dispatch[] = { 1, 1, 1, 1, 1, 1, 1, 1, // 0320 1, 1, 1, 1, 1, 1, 1, 1, // 0330 2, 3, 3, 3, 3, 3, 3, 3, // 0340 utf8-3 - 3, 3, 3, 3, 3, 3, 3, 3, // 0350 - 4, 5, 5, 5, 5, 0, 0, 0, // 0360 utf8-4 + 3, 3, 3, 3, 3, 4, 3, 3, // 0350 + 5, 6, 6, 6, 7, 0, 0, 0, // 0360 utf8-4 0, 0, 0, 0, 0, 0, 0, 0, // 0370 }; @@ -94,6 +94,7 @@ bool32 isutf8(const void *data, size_t size) { } // fallthrough case 3: + case_utf8_3: if (p + 2 <= e && // (p[0] & 0300) == 0200 && // (p[1] & 0300) == 0200) { // @@ -103,11 +104,17 @@ bool32 isutf8(const void *data, size_t size) { return false; // missing cont } case 4: + if (p < e && (*p & 040)) { + return false; // utf-16 surrogate + } + goto case_utf8_3; + case 5: if (p < e && (*p & 0377) < 0220) { return false; // overlong } // fallthrough - case 5: + case 6: + case_utf8_4: if (p + 3 <= e && // (((uint32_t)(p[+2] & 0377) << 030 | // (uint32_t)(p[+1] & 0377) << 020 | // @@ -119,6 +126,11 @@ bool32 isutf8(const void *data, size_t size) { } else { return false; // missing cont } + case 7: + if (p < e && (*p & 0x3F) > 0xF) { + return false; // over limit + } + goto case_utf8_4; default: __builtin_unreachable(); } diff --git a/test/libc/str/isutf8_test.c b/test/libc/str/isutf8_test.c index 0f3a162e415..0d11019e812 100644 --- a/test/libc/str/isutf8_test.c +++ b/test/libc/str/isutf8_test.c @@ -39,6 +39,9 @@ TEST(isutf8, good) { "剑号巨阙 珠称夜光 果珍李柰 菜重芥姜 海咸河淡 鳞潜羽翔" "龙师火帝 鸟官人皇 始制文字 乃服衣裳 推位让国 有虞陶唐", -1)); + EXPECT_TRUE(isutf8("\xf4\x8f\xbf\xbf", -1)); + EXPECT_TRUE(isutf8("\xed\x9f\xbf", -1)); + EXPECT_TRUE(isutf8("\xee\x80\x80", -1)); } TEST(isutf8, bad) { @@ -46,6 +49,9 @@ TEST(isutf8, bad) { ASSERT_FALSE(isutf8("\200\300", -1)); // latin1 c1 control code ASSERT_FALSE(isutf8("\300\300", -1)); // missing continuation ASSERT_FALSE(isutf8("\377\200\200\200\200", -1)); // thompson-pike varint + ASSERT_FALSE(isutf8("\xf4\x90\x80\x80", -1)); // over limit + ASSERT_FALSE(isutf8("\xed\xa0\x80", -1)); + ASSERT_FALSE(isutf8("\xed\xbf\xbf", -1)); // surrogate pairs } TEST(isutf8, oob) {