diff options
author | Ali Mohammad Pur <ali.mpfard@gmail.com> | 2022-01-21 14:30:47 +0330 |
---|---|---|
committer | Ali Mohammad Pur <Ali.mpfard@gmail.com> | 2022-01-21 18:14:08 +0330 |
commit | c11be92e23d899e28d45f67be24e47b2e5114d3a (patch) | |
tree | a3635341f3d99cf05308d0d6d94a672b1d76622f | |
parent | bfe8f312f3674d83f25191ee8acbb71af222697a (diff) | |
download | serenity-c11be92e23d899e28d45f67be24e47b2e5114d3a.zip |
LibRegex: Implement an ECMA262 Regex quirk with negative lookarounds
This implements the quirk defined by "Note 3" in section "Canonicalize"
(https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch).
Crosses off another quirk from #6042.
-rw-r--r-- | Tests/LibRegex/Regex.cpp | 1 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexMatcher.cpp | 15 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.cpp | 40 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.h | 14 |
4 files changed, 54 insertions, 16 deletions
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 3123472bc8..d054f467e5 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -682,6 +682,7 @@ TEST_CASE(ECMA262_match) { "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, { "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended }, { "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern + { "(.*?)a(?!(a+)b\\2c)\\2(.*)"sv, "baaabaac"sv, true }, // #6042, Groups inside lookarounds may be referenced outside, but their contents appear empty if the pattern in the lookaround fails. }; // clang-format on diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index b065bc146b..26ae07d886 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -486,7 +486,14 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta return true; case ExecutionResult::Failed: if (!states_to_try_next.is_empty()) { - state = states_to_try_next.take_last(); + auto next_state = states_to_try_next.take_last(); + // Note: ECMA262 quirk: Note 3, https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch + // capture groups defined in lookarounds "leak" outside the regex, + // but their contents are empty if the lookaround fails. + // This is done by manually clearing the groups where needed, and leaking their contents here. + if constexpr (IsSame<Parser, ECMA262>) + swap(next_state.capture_group_matches, state.capture_group_matches); + state = move(next_state); continue; } return false; @@ -496,7 +503,11 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta return {}; return false; } - state = states_to_try_next.take_last(); + auto next_state = states_to_try_next.take_last(); + // See note above about an ECMA262 quirk. + if constexpr (IsSame<Parser, ECMA262>) + swap(next_state.capture_group_matches, state.capture_group_matches); + state = move(next_state); ++recursion_level; continue; } diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 5fc6aeb1c4..61b5ff1843 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -10,6 +10,7 @@ #include <AK/AnyOf.h> #include <AK/CharacterTypes.h> #include <AK/GenericLexer.h> +#include <AK/ScopeGuard.h> #include <AK/String.h> #include <AK/StringBuilder.h> #include <AK/StringUtils.h> @@ -1065,9 +1066,17 @@ bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& ma return true; } if (should_parse_forward_assertion && try_skip("!")) { + enter_capture_group_scope(); + ScopeGuard quit_scope { + [this] { + exit_capture_group_scope(); + } + }; if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named)) return false; stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead); + clear_all_capture_groups_in_scope(stack); + return true; } if (m_should_use_browser_extended_grammar) { @@ -1086,9 +1095,16 @@ bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& ma return true; } if (try_skip("<!")) { + enter_capture_group_scope(); + ScopeGuard quit_scope { + [this] { + exit_capture_group_scope(); + } + }; if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named)) return false; stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookBehind, length_dummy); + clear_all_capture_groups_in_scope(stack); return true; } @@ -1124,10 +1140,17 @@ bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, bool return true; } if (try_skip("!")) { + enter_capture_group_scope(); + ScopeGuard quit_scope { + [this] { + exit_capture_group_scope(); + } + }; if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named)) return false; stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead); + clear_all_capture_groups_in_scope(stack); return true; } @@ -2189,20 +2212,9 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi { consume(TokenType::LeftParen, Error::InvalidPattern); - auto enter_capture_group_scope = [&] { - m_capture_groups_in_scope.empend(); - }; - auto exit_capture_group_scope = [&] { - auto last = m_capture_groups_in_scope.take_last(); - m_capture_groups_in_scope.last().extend(move(last)); - }; auto register_capture_group_in_current_scope = [&](auto identifier) { m_capture_groups_in_scope.last().empend(identifier); }; - auto clear_all_capture_groups_in_scope = [&] { - for (auto& index : m_capture_groups_in_scope.last()) - stack.insert_bytecode_clear_capture_group(index); - }; if (match(TokenType::Questionmark)) { // Non-capturing group or group with specifier. @@ -2216,7 +2228,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi enter_capture_group_scope(); if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named)) return set_error(Error::InvalidPattern); - clear_all_capture_groups_in_scope(); + clear_all_capture_groups_in_scope(stack); exit_capture_group_scope(); consume(TokenType::RightParen, Error::MismatchingParen); @@ -2246,7 +2258,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi enter_capture_group_scope(); if (!parse_disjunction(capture_group_bytecode, length, unicode, named)) return set_error(Error::InvalidPattern); - clear_all_capture_groups_in_scope(); + clear_all_capture_groups_in_scope(stack); exit_capture_group_scope(); register_capture_group_in_current_scope(group_index); @@ -2277,7 +2289,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi if (!parse_disjunction(capture_group_bytecode, length, unicode, named)) return set_error(Error::InvalidPattern); - clear_all_capture_groups_in_scope(); + clear_all_capture_groups_in_scope(stack); exit_capture_group_scope(); register_capture_group_in_current_scope(group_index); diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index eb66221f21..b7b3691835 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -254,6 +254,20 @@ private: size_t ensure_total_number_of_capturing_parenthesis(); + void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); } + + void exit_capture_group_scope() + { + auto last = m_capture_groups_in_scope.take_last(); + m_capture_groups_in_scope.last().extend(move(last)); + } + + void clear_all_capture_groups_in_scope(ByteCode& stack) + { + for (auto& index : m_capture_groups_in_scope.last()) + stack.insert_bytecode_clear_capture_group(index); + }; + // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always // parse it twice, so we'll just do so when it's actually needed. |