summaryrefslogtreecommitdiff
path: root/Userland
diff options
context:
space:
mode:
authorAli Mohammad Pur <ali.mpfard@gmail.com>2021-07-23 19:37:18 +0430
committerAli Mohammad Pur <Ali.mpfard@gmail.com>2021-07-23 21:19:21 +0430
commitc8b2199251ff710f8c0b4387bc7eb53803a8a26e (patch)
tree66067b56a0ca159b00e412e4fcde043c950c4427 /Userland
parent34ec0fa8ad03de233c64578f7cb60ebdbf88e826 (diff)
downloadserenity-c8b2199251ff710f8c0b4387bc7eb53803a8a26e.zip
LibRegex: Clear previous capture group contents in ECMA262 mode
ECMA262 requires that capture groups only contain the values from the last iteration of an enclosing repetition, e.g. `((c)(a)?(b))+` should _not_ retain 'a' in the `(a)` capture group when matching "cabcb".
Diffstat (limited to 'Userland')
-rw-r--r--Userland/Libraries/LibRegex/RegexByteCode.cpp25
-rw-r--r--Userland/Libraries/LibRegex/RegexByteCode.h37
-rw-r--r--Userland/Libraries/LibRegex/RegexParser.cpp40
-rw-r--r--Userland/Libraries/LibRegex/RegexParser.h8
4 files changed, 109 insertions, 1 deletions
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index 259aaf26af..05c7c63a17 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -132,6 +132,12 @@ void ByteCode::ensure_opcodes_initialized()
case OpCodeId::CheckBegin:
s_opcodes[i] = make<OpCode_CheckBegin>();
break;
+ case OpCodeId::ClearCaptureGroup:
+ s_opcodes[i] = make<OpCode_ClearCaptureGroup>();
+ break;
+ case OpCodeId::ClearNamedCaptureGroup:
+ s_opcodes[i] = make<OpCode_ClearNamedCaptureGroup>();
+ break;
case OpCodeId::SaveLeftCaptureGroup:
s_opcodes[i] = make<OpCode_SaveLeftCaptureGroup>();
break;
@@ -288,6 +294,16 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input,
return ExecutionResult::Failed_ExecuteLowPrioForks;
}
+// Resets the positional capture group `id()` of the current match to an empty value.
+// Does nothing when the current match, or that group within it, has no slot yet.
+// Always lets execution continue.
+ALWAYS_INLINE ExecutionResult OpCode_ClearCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
+{
+    // No storage for this match: nothing to clear.
+    if (input.match_index >= state.capture_group_matches.size())
+        return ExecutionResult::Continue;
+
+    auto& groups_of_match = state.capture_group_matches[input.match_index];
+    auto const group_id = id();
+    if (group_id < groups_of_match.size())
+        groups_of_match[group_id] = {};
+
+    return ExecutionResult::Continue;
+}
+
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
if (input.match_index >= state.capture_group_matches.size()) {
@@ -333,6 +349,15 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchI
return ExecutionResult::Continue;
}
+// Removes the named capture group `name()` from the current match's named captures.
+// Does nothing when the current match has no named-capture storage yet.
+// Always lets execution continue.
+ALWAYS_INLINE ExecutionResult OpCode_ClearNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
+{
+    // Bounds-check the container that is actually indexed below. The positional and
+    // named containers are size-checked separately elsewhere in this file, so the
+    // size of one says nothing reliable about the other.
+    if (input.match_index < state.named_capture_group_matches.size()) {
+        auto& group = state.named_capture_group_matches[input.match_index];
+        group.remove(name());
+    }
+    return ExecutionResult::Continue;
+}
+
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
if (input.match_index >= state.named_capture_group_matches.size()) {
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h
index c696ace0f8..a897fe4d84 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.h
+++ b/Userland/Libraries/LibRegex/RegexByteCode.h
@@ -39,6 +39,8 @@ using ByteCodeValueType = u64;
__ENUMERATE_OPCODE(Save) \
__ENUMERATE_OPCODE(Restore) \
__ENUMERATE_OPCODE(GoBack) \
+ __ENUMERATE_OPCODE(ClearCaptureGroup) \
+ __ENUMERATE_OPCODE(ClearNamedCaptureGroup) \
__ENUMERATE_OPCODE(Exit)
// clang-format off
@@ -174,6 +176,19 @@ public:
extend(move(bytecode));
}
+    // Appends a ClearCaptureGroup instruction: the opcode word followed by the
+    // index of the capture group to clear.
+    void insert_bytecode_clear_capture_group(size_t index)
+    {
+        auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::ClearCaptureGroup);
+        empend(opcode);
+        empend(index);
+    }
+
+    // Appends a ClearNamedCaptureGroup instruction: the opcode word, the address of
+    // the name's characters, and the name's length.
+    // NOTE(review): only the address is stored, not a copy of the characters, so the
+    // storage behind `name` presumably has to outlive the bytecode - confirm at the callers.
+    void insert_bytecode_clear_named_capture_group(StringView name)
+    {
+        auto const* name_characters = name.characters_without_null_termination();
+        auto const name_length = name.length();
+
+        empend(static_cast<ByteCodeValueType>(OpCodeId::ClearNamedCaptureGroup));
+        empend(reinterpret_cast<ByteCodeValueType>(name_characters));
+        empend(name_length);
+    }
+
void insert_bytecode_compare_string(StringView view)
{
ByteCode bytecode;
@@ -626,6 +641,28 @@ public:
const String arguments_string() const override { return String::formatted("kind={} ({})", (long unsigned int)argument(0), boundary_check_type_name(type())); }
};
+// Opcode that clears one positional capture group.
+// Encoding: [opcode, group id].
+class OpCode_ClearCaptureGroup final : public OpCode {
+public:
+    // Index of the capture group to clear (first instruction argument).
+    ALWAYS_INLINE size_t id() const { return argument(0); }
+
+    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearCaptureGroup; }
+    // One opcode word plus one argument word.
+    ALWAYS_INLINE size_t size() const override { return 2; }
+
+    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+
+    const String arguments_string() const override
+    {
+        return String::formatted("id={}", id());
+    }
+};
+
+// Opcode that clears one named capture group.
+// Encoding: [opcode, address of the name's characters, name length].
+class OpCode_ClearNamedCaptureGroup final : public OpCode {
+public:
+    // Length of the group name (second instruction argument).
+    ALWAYS_INLINE size_t length() const { return argument(1); }
+    // View over the group name, rebuilt from the address stored in the first argument.
+    ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
+
+    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearNamedCaptureGroup; }
+    // One opcode word plus two argument words.
+    ALWAYS_INLINE size_t size() const override { return 3; }
+
+    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+
+    const String arguments_string() const override { return String::formatted("name={}, length={}", name(), length()); }
+};
+
class OpCode_SaveLeftCaptureGroup final : public OpCode {
public:
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index 5d8537d168..ae5c2e98f8 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -1877,6 +1877,28 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
{
consume(TokenType::LeftParen, Error::InvalidPattern);
+    // Pushes a new, empty scope onto the stack of capture-group scopes.
+    auto enter_capture_group_scope = [&] {
+        m_capture_groups_in_scope.empend();
+    };
+    // Pops the innermost scope and appends its groups to the scope that enclosed it.
+    auto exit_capture_group_scope = [&] {
+        auto closed_scope = m_capture_groups_in_scope.take_last();
+        auto& enclosing_scope = m_capture_groups_in_scope.last();
+        enclosing_scope.extend(move(closed_scope));
+    };
+    // Records a capture group (by index or by name) in the innermost scope.
+    auto register_capture_group_in_current_scope = [&](auto identifier) {
+        auto& current_scope = m_capture_groups_in_scope.last();
+        current_scope.empend(identifier);
+    };
+    // Emits one clear instruction into `stack` for every group recorded in the innermost scope.
+    auto clear_all_capture_groups_in_scope = [&] {
+        for (auto& group : m_capture_groups_in_scope.last()) {
+            group.visit(
+                [&](size_t index) {
+                    stack.insert_bytecode_clear_capture_group(index);
+                },
+                [&](String const& name) {
+                    stack.insert_bytecode_clear_named_capture_group(name);
+                });
+        }
+    };
+
if (match(TokenType::Questionmark)) {
// Non-capturing group or group with specifier.
consume();
@@ -1885,8 +1907,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
consume();
ByteCode noncapture_group_bytecode;
size_t length = 0;
+
+ enter_capture_group_scope();
if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
return set_error(Error::InvalidPattern);
+ clear_all_capture_groups_in_scope();
+ exit_capture_group_scope();
consume(TokenType::RightParen, Error::MismatchingParen);
@@ -1907,8 +1933,14 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
ByteCode capture_group_bytecode;
size_t length = 0;
+ enter_capture_group_scope();
if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
return set_error(Error::InvalidPattern);
+ clear_all_capture_groups_in_scope();
+ exit_capture_group_scope();
+
+ register_capture_group_in_current_scope(name);
+ register_capture_group_in_current_scope(group_index);
consume(TokenType::RightParen, Error::MismatchingParen);
@@ -1930,7 +1962,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
}
auto group_index = ++m_parser_state.capture_groups_count;
- stack.insert_bytecode_group_capture_left(group_index);
+ enter_capture_group_scope();
ByteCode capture_group_bytecode;
size_t length = 0;
@@ -1938,6 +1970,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
return set_error(Error::InvalidPattern);
+ clear_all_capture_groups_in_scope();
+ exit_capture_group_scope();
+
+ register_capture_group_in_current_scope(group_index);
+
+ stack.insert_bytecode_group_capture_left(group_index);
stack.extend(move(capture_group_bytecode));
m_parser_state.capture_group_minimum_lengths.set(group_index, length);
diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h
index 5be99edbba..edfd64e3a5 100644
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@@ -190,12 +190,14 @@ public:
explicit ECMA262Parser(Lexer& lexer)
: Parser(lexer)
{
+ m_capture_groups_in_scope.empend();
}
ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
: Parser(lexer, regex_options.value_or({}))
{
m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
+ m_capture_groups_in_scope.empend();
}
~ECMA262Parser() = default;
@@ -242,6 +244,12 @@ private:
// Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
bool m_should_use_browser_extended_grammar { false };
+
+ // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
+ // by requiring that (...)+ only contain the matches for the last iteration.
+ // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
+ using CaptureGroup = Variant<size_t, String>;
+ Vector<Vector<CaptureGroup>> m_capture_groups_in_scope;
};
using PosixExtended = PosixExtendedParser;