diff options
author | Ali Mohammad Pur <ali.mpfard@gmail.com> | 2021-07-23 19:37:18 +0430 |
---|---|---|
committer | Ali Mohammad Pur <Ali.mpfard@gmail.com> | 2021-07-23 21:19:21 +0430 |
commit | c8b2199251ff710f8c0b4387bc7eb53803a8a26e (patch) | |
tree | 66067b56a0ca159b00e412e4fcde043c950c4427 /Userland | |
parent | 34ec0fa8ad03de233c64578f7cb60ebdbf88e826 (diff) | |
download | serenity-c8b2199251ff710f8c0b4387bc7eb53803a8a26e.zip |
LibRegex: Clear previous capture group contents in ECMA262 mode
ECMA262 requires that the capture groups only contain the values from
the last iteration, e.g. `((c)(a)?(b))+` should _not_ contain 'a' in the
`(a)` capture group when matching "cabcb".
Diffstat (limited to 'Userland')
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.cpp | 25 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.h | 37 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.cpp | 40 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.h | 8 |
4 files changed, 109 insertions, 1 deletions
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 259aaf26af..05c7c63a17 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -132,6 +132,12 @@ void ByteCode::ensure_opcodes_initialized() case OpCodeId::CheckBegin: s_opcodes[i] = make<OpCode_CheckBegin>(); break; + case OpCodeId::ClearCaptureGroup: + s_opcodes[i] = make<OpCode_ClearCaptureGroup>(); + break; + case OpCodeId::ClearNamedCaptureGroup: + s_opcodes[i] = make<OpCode_ClearNamedCaptureGroup>(); + break; case OpCodeId::SaveLeftCaptureGroup: s_opcodes[i] = make<OpCode_SaveLeftCaptureGroup>(); break; @@ -288,6 +294,16 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, return ExecutionResult::Failed_ExecuteLowPrioForks; } +ALWAYS_INLINE ExecutionResult OpCode_ClearCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const +{ + if (input.match_index < state.capture_group_matches.size()) { + auto& group = state.capture_group_matches[input.match_index]; + if (id() < group.size()) + group[id()] = {}; + } + return ExecutionResult::Continue; +} + ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const { if (input.match_index >= state.capture_group_matches.size()) { @@ -333,6 +349,15 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchI return ExecutionResult::Continue; } +ALWAYS_INLINE ExecutionResult OpCode_ClearNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const +{ + if (input.match_index < state.capture_group_matches.size()) { + auto& group = state.named_capture_group_matches[input.match_index]; + group.remove(name()); + } + return ExecutionResult::Continue; +} + ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) 
const { if (input.match_index >= state.named_capture_group_matches.size()) { diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h index c696ace0f8..a897fe4d84 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.h +++ b/Userland/Libraries/LibRegex/RegexByteCode.h @@ -39,6 +39,8 @@ using ByteCodeValueType = u64; __ENUMERATE_OPCODE(Save) \ __ENUMERATE_OPCODE(Restore) \ __ENUMERATE_OPCODE(GoBack) \ + __ENUMERATE_OPCODE(ClearCaptureGroup) \ + __ENUMERATE_OPCODE(ClearNamedCaptureGroup) \ __ENUMERATE_OPCODE(Exit) // clang-format off @@ -174,6 +176,19 @@ public: extend(move(bytecode)); } + void insert_bytecode_clear_capture_group(size_t index) + { + empend(static_cast<ByteCodeValueType>(OpCodeId::ClearCaptureGroup)); + empend(index); + } + + void insert_bytecode_clear_named_capture_group(StringView name) + { + empend(static_cast<ByteCodeValueType>(OpCodeId::ClearNamedCaptureGroup)); + empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination())); + empend(name.length()); + } + void insert_bytecode_compare_string(StringView view) { ByteCode bytecode; @@ -626,6 +641,28 @@ public: const String arguments_string() const override { return String::formatted("kind={} ({})", (long unsigned int)argument(0), boundary_check_type_name(type())); } }; +class OpCode_ClearCaptureGroup final : public OpCode { +public: + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearCaptureGroup; } + ALWAYS_INLINE size_t size() const override { return 2; } + ALWAYS_INLINE size_t id() const { return argument(0); } + const String arguments_string() const override { return String::formatted("id={}", id()); } +}; + +class OpCode_ClearNamedCaptureGroup final : public OpCode { +public: + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + 
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearNamedCaptureGroup; } + ALWAYS_INLINE size_t size() const override { return 3; } + ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; } + ALWAYS_INLINE size_t length() const { return argument(1); } + const String arguments_string() const override + { + return String::formatted("name={}, length={}", name(), length()); + } +}; + class OpCode_SaveLeftCaptureGroup final : public OpCode { public: ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 5d8537d168..ae5c2e98f8 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -1877,6 +1877,28 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi { consume(TokenType::LeftParen, Error::InvalidPattern); + auto enter_capture_group_scope = [&] { + m_capture_groups_in_scope.empend(); + }; + auto exit_capture_group_scope = [&] { + auto last = m_capture_groups_in_scope.take_last(); + m_capture_groups_in_scope.last().extend(move(last)); + }; + auto register_capture_group_in_current_scope = [&](auto identifier) { + m_capture_groups_in_scope.last().empend(identifier); + }; + auto clear_all_capture_groups_in_scope = [&] { + for (auto& entry : m_capture_groups_in_scope.last()) { + entry.visit( + [&](size_t index) { + stack.insert_bytecode_clear_capture_group(index); + }, + [&](String const& name) { + stack.insert_bytecode_clear_named_capture_group(name); + }); + } + }; + if (match(TokenType::Questionmark)) { // Non-capturing group or group with specifier. 
consume(); @@ -1885,8 +1907,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi consume(); ByteCode noncapture_group_bytecode; size_t length = 0; + + enter_capture_group_scope(); if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named)) return set_error(Error::InvalidPattern); + clear_all_capture_groups_in_scope(); + exit_capture_group_scope(); consume(TokenType::RightParen, Error::MismatchingParen); @@ -1907,8 +1933,14 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi ByteCode capture_group_bytecode; size_t length = 0; + enter_capture_group_scope(); if (!parse_disjunction(capture_group_bytecode, length, unicode, named)) return set_error(Error::InvalidPattern); + clear_all_capture_groups_in_scope(); + exit_capture_group_scope(); + + register_capture_group_in_current_scope(name); + register_capture_group_in_current_scope(group_index); consume(TokenType::RightParen, Error::MismatchingParen); @@ -1930,7 +1962,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi } auto group_index = ++m_parser_state.capture_groups_count; - stack.insert_bytecode_group_capture_left(group_index); + enter_capture_group_scope(); ByteCode capture_group_bytecode; size_t length = 0; @@ -1938,6 +1970,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi if (!parse_disjunction(capture_group_bytecode, length, unicode, named)) return set_error(Error::InvalidPattern); + clear_all_capture_groups_in_scope(); + exit_capture_group_scope(); + + register_capture_group_in_current_scope(group_index); + + stack.insert_bytecode_group_capture_left(group_index); stack.extend(move(capture_group_bytecode)); m_parser_state.capture_group_minimum_lengths.set(group_index, length); diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index 5be99edbba..edfd64e3a5 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ 
b/Userland/Libraries/LibRegex/RegexParser.h @@ -190,12 +190,14 @@ public: explicit ECMA262Parser(Lexer& lexer) : Parser(lexer) { + m_capture_groups_in_scope.empend(); } ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options) : Parser(lexer, regex_options.value_or({})) { m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended); + m_capture_groups_in_scope.empend(); } ~ECMA262Parser() = default; @@ -242,6 +244,12 @@ private: // Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag. bool m_should_use_browser_extended_grammar { false }; + + // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it, + // by requiring that (...)+ only contain the matches for the last iteration. + // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed. + using CaptureGroup = Variant<size_t, String>; + Vector<Vector<CaptureGroup>> m_capture_groups_in_scope; }; using PosixExtended = PosixExtendedParser; |