diff options
author | Timothy Flynn <trflynn89@pm.me> | 2022-04-03 09:03:14 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2022-04-05 00:14:29 +0100 |
commit | 9e5abec6f16ae5ded9374a4ba8de5300d6aaf7d0 (patch) | |
tree | 4df1f5a9b97d671403c9919e885eef5289943e98 /Tests | |
parent | 119873b8229914b9e709e446e4e5f1f5df83b13d (diff) | |
download | serenity-9e5abec6f16ae5ded9374a4ba8de5300d6aaf7d0.zip |
AK: Invalidate UTF-8 encoded code points larger than U+10ffff
On oss-fuzz, the LibJS REPL is provided a file encoded with Windows-1252
with the following contents:
/ô¡°½/
The REPL assumes the input file is UTF-8. So in Windows-1252, the above
is represented as [0x2f 0xf4 0xa1 0xb0 0xbd 0x2f]. The inner 4 bytes are
actually a valid UTF-8 encoding if we only look at the most significant
bits to parse leading/continuation bytes. However, it decodes to the
code point U+121c3d, which is not a valid code point.
This commit adds additional validation to ensure the decoded code point
itself is also valid.
Diffstat (limited to 'Tests')
-rw-r--r-- | Tests/AK/TestUtf8.cpp | 10 |
1 files changed, 10 insertions, 0 deletions
diff --git a/Tests/AK/TestUtf8.cpp b/Tests/AK/TestUtf8.cpp
index 7139c06aca..f8e4867258 100644
--- a/Tests/AK/TestUtf8.cpp
+++ b/Tests/AK/TestUtf8.cpp
@@ -70,6 +70,16 @@ TEST_CASE(validate_invalid_ut8)
     Utf8View utf8_4 { StringView { invalid_utf8_4 } };
     EXPECT(!utf8_4.validate(valid_bytes));
     EXPECT(valid_bytes == 0);
+
+    char invalid_utf8_5[] = { (char)0xf4, (char)0x8f, (char)0xbf, (char)0xc0, 0 }; // U+110000
+    Utf8View utf8_5 { StringView { invalid_utf8_5 } };
+    EXPECT(!utf8_5.validate(valid_bytes));
+    EXPECT(valid_bytes == 0);
+
+    char invalid_utf8_6[] = { (char)0xf4, (char)0xa1, (char)0xb0, (char)0xbd, 0 }; // U+121c3d
+    Utf8View utf8_6 { StringView { invalid_utf8_6 } };
+    EXPECT(!utf8_6.validate(valid_bytes));
+    EXPECT(valid_bytes == 0);
 }
 
 TEST_CASE(iterate_utf8)