diff options
author | Timothy Flynn <trflynn89@pm.me> | 2022-04-03 09:03:14 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2022-04-05 00:14:29 +0100 |
commit | 9e5abec6f16ae5ded9374a4ba8de5300d6aaf7d0 (patch) | |
tree | 4df1f5a9b97d671403c9919e885eef5289943e98 /AK/Utf8View.cpp | |
parent | 119873b8229914b9e709e446e4e5f1f5df83b13d (diff) | |
download | serenity-9e5abec6f16ae5ded9374a4ba8de5300d6aaf7d0.zip |
AK: Invalidate UTF-8 encoded code points larger than U+10ffff
On oss-fuzz, the LibJS REPL is provided a file encoded with Windows-1252
with the following contents:
/ô¡°½/
The REPL assumes the input file is UTF-8. In Windows-1252, the above
is encoded as the bytes [0x2f 0xf4 0xa1 0xb0 0xbd 0x2f]. The inner 4 bytes
form a structurally valid UTF-8 sequence if we only look at the most
significant bits to distinguish leading and continuation bytes. However,
they decode to the code point U+121C3D, which is greater than U+10FFFF (the
largest Unicode code point) and is therefore not a valid code point.
This commit adds additional validation to ensure the decoded code point
itself is also valid.
Diffstat (limited to 'AK/Utf8View.cpp')
-rw-r--r-- | AK/Utf8View.cpp | 13 |
1 file changed, 10 insertions, 3 deletions
```diff
diff --git a/AK/Utf8View.cpp b/AK/Utf8View.cpp
index 7d93d73b78..effc7aae98 100644
--- a/AK/Utf8View.cpp
+++ b/AK/Utf8View.cpp
@@ -6,6 +6,7 @@
  */
 
 #include <AK/Assertions.h>
+#include <AK/CharacterTypes.h>
 #include <AK/Format.h>
 #include <AK/Utf8View.h>
 
@@ -100,9 +101,9 @@ bool Utf8View::validate(size_t& valid_bytes) const
 {
     valid_bytes = 0;
     for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) {
-        size_t code_point_length_in_bytes;
-        u32 value;
-        bool first_byte_makes_sense = decode_first_byte(*ptr, code_point_length_in_bytes, value);
+        size_t code_point_length_in_bytes = 0;
+        u32 code_point = 0;
+        bool first_byte_makes_sense = decode_first_byte(*ptr, code_point_length_in_bytes, code_point);
         if (!first_byte_makes_sense)
             return false;
 
@@ -112,8 +113,14 @@ bool Utf8View::validate(size_t& valid_bytes) const
                 return false;
             if (*ptr >> 6 != 2)
                 return false;
+
+            code_point <<= 6;
+            code_point |= *ptr & 63;
         }
 
+        if (!is_unicode(code_point))
+            return false;
+
         valid_bytes += code_point_length_in_bytes;
     }
 
```