AK: Invalidate UTF-8 encoded code points larger than U+10ffff

On oss-fuzz, the LibJS REPL was given a file encoded in Windows-1252 with the following contents: /ô¡°½/. The REPL assumes its input file is UTF-8. In Windows-1252, the above is represented as the bytes [0x2f 0xf4 0xa1 0xb0 0xbd 0x2f]. The inner 4 bytes look like a valid UTF-8 sequence if we only inspect the most significant bits to classify leading/continuation bytes. However, they decode to the code point U+121c3d, which is larger than U+10ffff and therefore not a valid code point. This commit adds additional validation to ensure the decoded code point itself is also valid.
author: Timothy Flynn <trflynn89@pm.me> 2022-04-03 09:03:14 -0400
committer: Linus Groh <mail@linusgroh.de> 2022-04-05 00:14:29 +0100
commit: 9e5abec6f16ae5ded9374a4ba8de5300d6aaf7d0 (patch)
tree: 4df1f5a9b97d671403c9919e885eef5289943e98 /AK/Utf8View.cpp
parent: 119873b8229914b9e709e446e4e5f1f5df83b13d (diff)
download: serenity-9e5abec6f16ae5ded9374a4ba8de5300d6aaf7d0.zip
1 files changed, 10 insertions, 3 deletions
diff --git a/AK/Utf8View.cpp b/AK/Utf8View.cpp
index 7d93d73b78..effc7aae98 100644
--- a/AK/Utf8View.cpp
+++ b/AK/Utf8View.cpp
@@ -6,6 +6,7 @@
  */
 
 #include <AK/Assertions.h>
+#include <AK/CharacterTypes.h>
 #include <AK/Format.h>
 #include <AK/Utf8View.h>
 
@@ -100,9 +101,9 @@ bool Utf8View::validate(size_t& valid_bytes) const
 {
     valid_bytes = 0;
     for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) {
-        size_t code_point_length_in_bytes;
-        u32 value;
-        bool first_byte_makes_sense = decode_first_byte(*ptr, code_point_length_in_bytes, value);
+        size_t code_point_length_in_bytes = 0;
+        u32 code_point = 0;
+        bool first_byte_makes_sense = decode_first_byte(*ptr, code_point_length_in_bytes, code_point);
         if (!first_byte_makes_sense)
             return false;
 
@@ -112,8 +113,14 @@ bool Utf8View::validate(size_t& valid_bytes) const
                 return false;
             if (*ptr >> 6 != 2)
                 return false;
+
+            code_point <<= 6;
+            code_point |= *ptr & 63;
         }
 
+        if (!is_unicode(code_point))
+            return false;
+
         valid_bytes += code_point_length_in_bytes;
     }
author	Timothy Flynn <trflynn89@pm.me>	2022-04-03 09:03:14 -0400
committer	Linus Groh <mail@linusgroh.de>	2022-04-05 00:14:29 +0100
commit	9e5abec6f16ae5ded9374a4ba8de5300d6aaf7d0 (patch)
tree	4df1f5a9b97d671403c9919e885eef5289943e98 /AK/Utf8View.cpp
parent	119873b8229914b9e709e446e4e5f1f5df83b13d (diff)
download	serenity-9e5abec6f16ae5ded9374a4ba8de5300d6aaf7d0.zip