diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-08-20 10:22:23 -0400 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2021-08-20 19:16:33 +0200 |
commit | 562d4e497b286335b0ea956e4839e6dd9f140673 (patch) | |
tree | 24861043c7ecb441a67599389f2ebae80d9e17b0 | |
parent | 7c54b6bd45efbf3ba933e615131b3df6bbffef9f (diff) | |
download | serenity-562d4e497b286335b0ea956e4839e6dd9f140673.zip |
LibRegex: Treat pattern string characters as unsigned
For example, consider the following pattern:
new RegExp('\ud834\udf06', 'u')
With this pattern, the regex parser should insert the UTF-8 encoded
bytes 0xf0, 0x9d, 0x8c, and 0x86 into the bytecode. However, because these
bytes are currently treated as plain (signed) char values, they are all
negative, since each of them is > 0x7f. Then, when these values are cast
to u64, sign extension copies the sign bit into all of the upper bits. The
result is that these bytes are inserted as 0xfffffffffffffff0,
0xffffffffffffff9d, etc.
Fortunately, there are only a few places where we insert bytecode with
the raw characters. In these places, be sure to cast the bytes to u8
before they are cast to u64, so that they are zero-extended rather than
sign-extended.
-rw-r--r-- | Tests/LibRegex/Regex.cpp | 2 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.cpp | 12 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.h | 2 |
3 files changed, 9 insertions, 7 deletions
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 7a14f30eb9..d7630fb25d 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -687,6 +687,8 @@ TEST_CASE(ECMA262_unicode_match) ECMAScriptFlags options {}; }; _test tests[] { + { "\xf0\x9d\x8c\x86"sv, "abcdef"sv, false, ECMAScriptFlags::Unicode }, + { "[\xf0\x9d\x8c\x86]"sv, "abcdef"sv, false, ECMAScriptFlags::Unicode }, { "\\ud83d"sv, "😀"sv, true }, { "\\ud83d"sv, "😀"sv, false, ECMAScriptFlags::Unicode }, { "\\ude00"sv, "😀"sv, true }, diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 02598f5da9..6e7f4eba8d 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -145,9 +145,9 @@ ALWAYS_INLINE bool Parser::lookahead_any(StringView str) return false; } -ALWAYS_INLINE char Parser::skip() +ALWAYS_INLINE unsigned char Parser::skip() { - char ch; + unsigned char ch; if (m_parser_state.current_token.value().length() == 1) { ch = m_parser_state.current_token.value()[0]; } else { @@ -1287,7 +1287,7 @@ bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bo // Also part of AtomEscape. 
auto token = consume(); match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[1] } }); + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token.value()[1] } }); return true; } if (try_skip("\\")) { @@ -1326,7 +1326,7 @@ bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bo if (m_should_use_browser_extended_grammar) { auto token = consume(); match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } }); + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token.value()[0] } }); return true; } else { return false; @@ -1336,7 +1336,7 @@ bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bo if (match_ordinary_characters()) { auto token = consume().value(); match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token[0] } }); + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token[0] } }); return true; } @@ -1594,7 +1594,7 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini // Allow all SourceCharacter's as escapes here. 
auto token = consume(); match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } }); + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token.value()[0] } }); return true; } diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index 480320037a..33bccca223 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -84,7 +84,7 @@ protected: ALWAYS_INLINE Optional<u32> consume_escaped_code_point(bool unicode); ALWAYS_INLINE bool try_skip(StringView); ALWAYS_INLINE bool lookahead_any(StringView); - ALWAYS_INLINE char skip(); + ALWAYS_INLINE unsigned char skip(); ALWAYS_INLINE void back(size_t = 1); ALWAYS_INLINE void reset(); ALWAYS_INLINE bool done() const; |