diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-08-11 16:41:57 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2021-08-15 11:43:45 +0100 |
commit | 6a485f612fbda354c8c67e9ccc3c119356687354 (patch) | |
tree | 193fda6eb17b87d404501de0fbf028db6997ebce | |
parent | 83ca8c7e381c35ee61a957c1fc82a9db9c6db6fb (diff) | |
download | serenity-6a485f612fbda354c8c67e9ccc3c119356687354.zip |
LibRegex: Implement legacy octal escape parsing closer to the spec
The grammar for the ECMA-262 CharacterEscape is:
CharacterEscape[U, N] ::
ControlEscape
c ControlLetter
0 [lookahead ∉ DecimalDigit]
HexEscapeSequence
RegExpUnicodeEscapeSequence[?U]
[~U]LegacyOctalEscapeSequence
IdentityEscape[?U, ?N]
It's important to parse the standalone "\0 [lookahead ∉ DecimalDigit]"
before parsing LegacyOctalEscapeSequence. Otherwise, every standalone "\0"
pattern is parsed as a legacy octal escape, which is disallowed in Unicode mode.
Further, LegacyOctalEscapeSequence should also be parsed while parsing
character classes.
-rw-r--r-- | Tests/LibRegex/Regex.cpp | 20 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.cpp | 32 |
2 files changed, 43 insertions, 9 deletions
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 36ac6f1498..2e8cce5c30 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -22,6 +22,12 @@ static PosixOptions match_test_api_options(const PosixOptions options) return options; } +template<typename... Flags> +static constexpr ECMAScriptFlags combine_flags(Flags&&... flags) requires((IsSame<Flags, ECMAScriptFlags> && ...)) +{ + return static_cast<ECMAScriptFlags>((static_cast<regex::FlagsUnderlyingType>(flags) | ...)); +} + TEST_CASE(regex_options_ecmascript) { ECMAScriptOptions eo; @@ -543,6 +549,14 @@ TEST_CASE(ECMA262_parse) { "\\A"sv, regex::Error::InvalidCharacterClass, ECMAScriptFlags::Unicode }, { "[\\A]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, { "[\\A]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, + { "\\0"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, + { "\\0"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, + { "\\00"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, + { "\\00"sv, regex::Error::InvalidCharacterClass, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, + { "[\\0]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, + { "[\\0]"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, + { "[\\00]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, + { "[\\00]"sv, regex::Error::InvalidPattern, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, }; for (auto& test : tests) { @@ -606,6 +620,12 @@ TEST_CASE(ECMA262_match) "return /xx/"sv, true, ECMAScriptFlags::BrowserExtended }, // #5517, appears to be matching JS expressions that involve regular expressions... 
{ "a{2,}"sv, "aaaa"sv }, // #5518 + { "\\0"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended }, + { "\\0"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, + { "\\01"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended }, + { "[\\0]"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended }, + { "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, + { "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended }, }; // clang-format on diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index a19faa02f0..a0393a6ac0 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -18,6 +18,7 @@ namespace regex { static constexpr size_t s_maximum_repetition_count = 1024 * 1024; static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv; +static constexpr auto s_decimal_characters = "0123456789"sv; ALWAYS_INLINE bool Parser::set_error(Error error) { @@ -1430,6 +1431,17 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini return true; } + // '\0' + if (try_skip("0")) { + if (!lookahead_any(s_decimal_characters)) { + match_length_minimum += 1; + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } }); + return true; + } + + back(); + } + // LegacyOctalEscapeSequence if (m_should_use_browser_extended_grammar) { if (!unicode) { @@ -1441,13 +1453,6 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini } } - // '\0' - if (try_skip("0")) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } }); - return true; - } - // HexEscape if (try_skip("x")) { if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) { @@ -1797,8 +1802,17 @@ bool 
ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& } // '\0' - if (try_skip("0")) - return { CharClassRangeElement { .code_point = 0, .is_character_class = false } }; + if (try_skip("0")) { + if (!lookahead_any(s_decimal_characters)) + return { CharClassRangeElement { .code_point = 0, .is_character_class = false } }; + back(); + } + + // LegacyOctalEscapeSequence + if (m_should_use_browser_extended_grammar && !unicode) { + if (auto escape = parse_legacy_octal_escape(); escape.has_value()) + return { CharClassRangeElement { .code_point = escape.value(), .is_character_class = false } }; + } // HexEscape if (try_skip("x")) { |