summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAli Mohammad Pur <ali.mpfard@gmail.com>2022-01-21 14:30:47 +0330
committerAli Mohammad Pur <Ali.mpfard@gmail.com>2022-01-21 18:14:08 +0330
commitc11be92e23d899e28d45f67be24e47b2e5114d3a (patch)
treea3635341f3d99cf05308d0d6d94a672b1d76622f
parentbfe8f312f3674d83f25191ee8acbb71af222697a (diff)
downloadserenity-c11be92e23d899e28d45f67be24e47b2e5114d3a.zip
LibRegex: Implement an ECMA262 Regex quirk with negative lookarounds
This implements the quirk defined by "Note 3" in section "Canonicalize" (https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch). Crosses off another quirk from #6042.
-rw-r--r--Tests/LibRegex/Regex.cpp1
-rw-r--r--Userland/Libraries/LibRegex/RegexMatcher.cpp15
-rw-r--r--Userland/Libraries/LibRegex/RegexParser.cpp40
-rw-r--r--Userland/Libraries/LibRegex/RegexParser.h14
4 files changed, 54 insertions, 16 deletions
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp
index 3123472bc8..d054f467e5 100644
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@@ -682,6 +682,7 @@ TEST_CASE(ECMA262_match)
{ "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
{ "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
{ "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern
+ { "(.*?)a(?!(a+)b\\2c)\\2(.*)"sv, "baaabaac"sv, true }, // #6042, Groups inside lookarounds may be referenced outside, but their contents appear empty if the pattern in the lookaround fails.
};
// clang-format on
diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp
index b065bc146b..26ae07d886 100644
--- a/Userland/Libraries/LibRegex/RegexMatcher.cpp
+++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp
@@ -486,7 +486,14 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta
return true;
case ExecutionResult::Failed:
if (!states_to_try_next.is_empty()) {
- state = states_to_try_next.take_last();
+ auto next_state = states_to_try_next.take_last();
+ // Note: ECMA262 quirk: Note 3, https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
+ // capture groups defined in lookarounds "leak" outside the regex,
+ // but their contents are empty if the lookaround fails.
+ // This is done by manually clearing the groups where needed, and leaking their contents here.
+ if constexpr (IsSame<Parser, ECMA262>)
+ swap(next_state.capture_group_matches, state.capture_group_matches);
+ state = move(next_state);
continue;
}
return false;
@@ -496,7 +503,11 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta
return {};
return false;
}
- state = states_to_try_next.take_last();
+ auto next_state = states_to_try_next.take_last();
+ // See note above about an ECMA262 quirk.
+ if constexpr (IsSame<Parser, ECMA262>)
+ swap(next_state.capture_group_matches, state.capture_group_matches);
+ state = move(next_state);
++recursion_level;
continue;
}
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index 5fc6aeb1c4..61b5ff1843 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -10,6 +10,7 @@
#include <AK/AnyOf.h>
#include <AK/CharacterTypes.h>
#include <AK/GenericLexer.h>
+#include <AK/ScopeGuard.h>
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/StringUtils.h>
@@ -1065,9 +1066,17 @@ bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& ma
return true;
}
if (should_parse_forward_assertion && try_skip("!")) {
+ enter_capture_group_scope();
+ ScopeGuard quit_scope {
+ [this] {
+ exit_capture_group_scope();
+ }
+ };
if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
return false;
stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
+ clear_all_capture_groups_in_scope(stack);
+
return true;
}
if (m_should_use_browser_extended_grammar) {
@@ -1086,9 +1095,16 @@ bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& ma
return true;
}
if (try_skip("<!")) {
+ enter_capture_group_scope();
+ ScopeGuard quit_scope {
+ [this] {
+ exit_capture_group_scope();
+ }
+ };
if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
return false;
stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookBehind, length_dummy);
+ clear_all_capture_groups_in_scope(stack);
return true;
}
@@ -1124,10 +1140,17 @@ bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, bool
return true;
}
if (try_skip("!")) {
+ enter_capture_group_scope();
+ ScopeGuard quit_scope {
+ [this] {
+ exit_capture_group_scope();
+ }
+ };
if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named))
return false;
stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
+ clear_all_capture_groups_in_scope(stack);
return true;
}
@@ -2189,20 +2212,9 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
{
consume(TokenType::LeftParen, Error::InvalidPattern);
- auto enter_capture_group_scope = [&] {
- m_capture_groups_in_scope.empend();
- };
- auto exit_capture_group_scope = [&] {
- auto last = m_capture_groups_in_scope.take_last();
- m_capture_groups_in_scope.last().extend(move(last));
- };
auto register_capture_group_in_current_scope = [&](auto identifier) {
m_capture_groups_in_scope.last().empend(identifier);
};
- auto clear_all_capture_groups_in_scope = [&] {
- for (auto& index : m_capture_groups_in_scope.last())
- stack.insert_bytecode_clear_capture_group(index);
- };
if (match(TokenType::Questionmark)) {
// Non-capturing group or group with specifier.
@@ -2216,7 +2228,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
enter_capture_group_scope();
if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
return set_error(Error::InvalidPattern);
- clear_all_capture_groups_in_scope();
+ clear_all_capture_groups_in_scope(stack);
exit_capture_group_scope();
consume(TokenType::RightParen, Error::MismatchingParen);
@@ -2246,7 +2258,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
enter_capture_group_scope();
if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
return set_error(Error::InvalidPattern);
- clear_all_capture_groups_in_scope();
+ clear_all_capture_groups_in_scope(stack);
exit_capture_group_scope();
register_capture_group_in_current_scope(group_index);
@@ -2277,7 +2289,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
return set_error(Error::InvalidPattern);
- clear_all_capture_groups_in_scope();
+ clear_all_capture_groups_in_scope(stack);
exit_capture_group_scope();
register_capture_group_in_current_scope(group_index);
diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h
index eb66221f21..b7b3691835 100644
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@@ -254,6 +254,20 @@ private:
size_t ensure_total_number_of_capturing_parenthesis();
+ void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
+
+ void exit_capture_group_scope()
+ {
+ auto last = m_capture_groups_in_scope.take_last();
+ m_capture_groups_in_scope.last().extend(move(last));
+ }
+
+ void clear_all_capture_groups_in_scope(ByteCode& stack)
+ {
+ for (auto& index : m_capture_groups_in_scope.last())
+ stack.insert_bytecode_clear_capture_group(index);
+ };
+
// ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
// always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
// parse it twice, so we'll just do so when it's actually needed.