diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-08-14 16:28:54 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2021-08-15 11:43:45 +0100 |
commit | f1ce998d73979b80352675d6d3170399b2311913 (patch) | |
tree | ab15c8d1713571bbbf87591c7d3a8e1b0e7b5f58 /Userland/Libraries/LibRegex | |
parent | fea181bde35b115de33adb5f80c3f6abe46cec72 (diff) | |
download | serenity-f1ce998d73979b80352675d6d3170399b2311913.zip |
LibRegex+LibJS: Combine named and unnamed capture groups in MatchState
Combining these into one list helps reduce the size of MatchState, and
as a result, reduces the amount of memory consumed during execution of
very large regex matches.
Doing this also allows us to remove two regex bytecode instructions,
ClearNamedCaptureGroup and SaveLeftNamedCaptureGroup, as well as the
NamedReference character compare type.
Named groups now behave the same as unnamed groups for these operations.
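Note that SaveRightNamedCaptureGroup still exists to cache the matched
group name; it now also takes the group's index so that the match is
stored in the combined capture group list.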
This also removes the recursion level from the MatchState, as it can
exist as a local variable in Matcher::execute instead.
Diffstat (limited to 'Userland/Libraries/LibRegex')
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.cpp | 80 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.h | 69 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexMatch.h | 15 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexMatcher.cpp | 22 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexMatcher.h | 1 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.cpp | 39 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.h | 11 |
7 files changed, 52 insertions, 185 deletions
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 7cad14ba92..3d9b1d3882 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -166,18 +166,12 @@ void ByteCode::ensure_opcodes_initialized() case OpCodeId::ClearCaptureGroup: s_opcodes[i] = make<OpCode_ClearCaptureGroup>(); break; - case OpCodeId::ClearNamedCaptureGroup: - s_opcodes[i] = make<OpCode_ClearNamedCaptureGroup>(); - break; case OpCodeId::SaveLeftCaptureGroup: s_opcodes[i] = make<OpCode_SaveLeftCaptureGroup>(); break; case OpCodeId::SaveRightCaptureGroup: s_opcodes[i] = make<OpCode_SaveRightCaptureGroup>(); break; - case OpCodeId::SaveLeftNamedCaptureGroup: - s_opcodes[i] = make<OpCode_SaveLeftNamedCaptureGroup>(); - break; case OpCodeId::SaveRightNamedCaptureGroup: s_opcodes[i] = make<OpCode_SaveRightNamedCaptureGroup>(); break; @@ -378,52 +372,26 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(MatchInput c return ExecutionResult::Continue; } -ALWAYS_INLINE ExecutionResult OpCode_ClearNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const -{ - if (input.match_index < state.capture_group_matches.size()) { - auto& group = state.named_capture_group_matches[input.match_index]; - if (auto it = group.find(name()); it != group.end()) - it->value.reset(); - } - return ExecutionResult::Continue; -} - -ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const -{ - if (input.match_index >= state.named_capture_group_matches.size()) { - state.named_capture_group_matches.ensure_capacity(input.match_index); - auto capacity = state.named_capture_group_matches.capacity(); - for (size_t i = state.named_capture_group_matches.size(); i <= capacity; ++i) - state.named_capture_group_matches.empend(); - } - 
state.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position; - return ExecutionResult::Continue; -} - ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const { - StringView capture_group_name = name(); + auto& match = state.capture_group_matches.at(input.match_index).at(id()); + auto start_position = match.left_column; + if (state.string_position < start_position) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + auto length = state.string_position - start_position; - if (state.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) { - auto start_position = state.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column; - auto length = state.string_position - start_position; + if (start_position < match.column) + return ExecutionResult::Continue; - auto& map = state.named_capture_group_matches.at(input.match_index); + VERIFY(start_position + length <= input.view.length()); - if constexpr (REGEX_DEBUG) { - VERIFY(start_position + length <= input.view.length()); - dbgln("Save named capture group with name={} and content='{}'", capture_group_name, input.view.substring_view(start_position, length)); - } + auto view = input.view.substring_view(start_position, length); - VERIFY(start_position + length <= input.view.length()); - auto view = input.view.substring_view(start_position, length); - if (input.regex_options & AllFlags::StringCopyMatches) { - map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string - } else { - map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string - } + if (input.regex_options & AllFlags::StringCopyMatches) { + match = { view.to_string(), name(), input.line, start_position, 
input.global_offset + start_position }; // create a copy of the original string } else { - warnln("Didn't find corresponding capture group match for name={}, match_index={}", capture_group_name.to_string(), input.match_index); + match = { view, name(), input.line, start_position, input.global_offset + start_position }; // take view to original string } return ExecutionResult::Continue; @@ -543,24 +511,6 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M if (!compare_string(input, state, str, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; - } else if (compare_type == CharacterCompareType::NamedReference) { - auto ptr = (char const*)m_bytecode->at(offset++); - auto length = (size_t)m_bytecode->at(offset++); - StringView name { ptr, length }; - - auto group = state.named_capture_group_matches.at(input.match_index).get(name); - if (!group.has_value()) - return ExecutionResult::Failed_ExecuteLowPrioForks; - - auto str = group.value().view; - - // We want to compare a string that is definitely longer than the available string - if (input.view.length() < state.string_position + str.length()) - return ExecutionResult::Failed_ExecuteLowPrioForks; - - if (!compare_string(input, state, str, had_zero_length_match)) - return ExecutionResult::Failed_ExecuteLowPrioForks; - } else if (compare_type == CharacterCompareType::Property) { auto property = static_cast<Unicode::Property>(m_bytecode->at(offset++)); compare_property(input, state, property, current_inversion_state(), inverse_matched); @@ -869,10 +819,6 @@ Vector<String> const OpCode_Compare::variable_arguments_to_string(Optional<Match buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7])); } } - } else if (compare_type == CharacterCompareType::NamedReference) { - auto ptr = (char const*)m_bytecode->at(offset++); - auto length = m_bytecode->at(offset++); - result.empend(String::formatted("name='{}'", StringView { ptr, (size_t)length })); } else if 
(compare_type == CharacterCompareType::Reference) { auto ref = m_bytecode->at(offset++); result.empend(String::formatted("number={}", ref)); diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h index c1c7af235b..ab305785e2 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.h +++ b/Userland/Libraries/LibRegex/RegexByteCode.h @@ -32,7 +32,6 @@ using ByteCodeValueType = u64; __ENUMERATE_OPCODE(FailForks) \ __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \ __ENUMERATE_OPCODE(SaveRightCaptureGroup) \ - __ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \ __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \ __ENUMERATE_OPCODE(CheckBegin) \ __ENUMERATE_OPCODE(CheckEnd) \ @@ -41,7 +40,6 @@ using ByteCodeValueType = u64; __ENUMERATE_OPCODE(Restore) \ __ENUMERATE_OPCODE(GoBack) \ __ENUMERATE_OPCODE(ClearCaptureGroup) \ - __ENUMERATE_OPCODE(ClearNamedCaptureGroup) \ __ENUMERATE_OPCODE(Exit) // clang-format off @@ -65,7 +63,6 @@ enum class OpCodeId : ByteCodeValueType { __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \ @@ -159,7 +156,6 @@ public: VERIFY(value.type != CharacterCompareType::RangeExpressionDummy); VERIFY(value.type != CharacterCompareType::Undefined); VERIFY(value.type != CharacterCompareType::String); - VERIFY(value.type != CharacterCompareType::NamedReference); arguments.append((ByteCodeValueType)value.type); if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse) @@ -187,13 +183,6 @@ public: empend(index); } - void insert_bytecode_clear_named_capture_group(StringView name) - { - 
empend(static_cast<ByteCodeValueType>(OpCodeId::ClearNamedCaptureGroup)); - empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination())); - empend(name.length()); - } - void insert_bytecode_compare_string(StringView view) { ByteCode bytecode; @@ -212,49 +201,24 @@ public: extend(move(bytecode)); } - void insert_bytecode_compare_named_reference(StringView name) - { - ByteCode bytecode; - - bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare)); - bytecode.empend(static_cast<u64>(1)); // number of arguments - - ByteCode arguments; - - arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::NamedReference)); - arguments.empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination())); - arguments.empend(name.length()); - - bytecode.empend(arguments.size()); // size of arguments - bytecode.extend(move(arguments)); - - extend(move(bytecode)); - } - void insert_bytecode_group_capture_left(size_t capture_groups_count) { empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup)); empend(capture_groups_count); } - void insert_bytecode_group_capture_left(StringView const& name) - { - empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftNamedCaptureGroup)); - empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination())); - empend(name.length()); - } - void insert_bytecode_group_capture_right(size_t capture_groups_count) { empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup)); empend(capture_groups_count); } - void insert_bytecode_group_capture_right(StringView const& name) + void insert_bytecode_group_capture_right(size_t capture_groups_count, StringView const& name) { empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup)); empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination())); empend(name.length()); + empend(capture_groups_count); } enum class LookAroundType { @@ -655,19 +619,6 @@ 
public: String const arguments_string() const override { return String::formatted("id={}", id()); } }; -class OpCode_ClearNamedCaptureGroup final : public OpCode { -public: - ExecutionResult execute(MatchInput const& input, MatchState& state, MatchOutput& output) const override; - ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearNamedCaptureGroup; } - ALWAYS_INLINE size_t size() const override { return 3; } - ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; } - ALWAYS_INLINE size_t length() const { return argument(1); } - String const arguments_string() const override - { - return String::formatted("name={}, length={}", name(), length()); - } -}; - class OpCode_SaveLeftCaptureGroup final : public OpCode { public: ExecutionResult execute(MatchInput const& input, MatchState& state, MatchOutput& output) const override; @@ -686,26 +637,14 @@ public: String const arguments_string() const override { return String::formatted("id={}", id()); } }; -class OpCode_SaveLeftNamedCaptureGroup final : public OpCode { -public: - ExecutionResult execute(MatchInput const& input, MatchState& state, MatchOutput& output) const override; - ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftNamedCaptureGroup; } - ALWAYS_INLINE size_t size() const override { return 3; } - ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; } - ALWAYS_INLINE size_t length() const { return argument(1); } - String const arguments_string() const override - { - return String::formatted("name={}, length={}", name(), length()); - } -}; - class OpCode_SaveRightNamedCaptureGroup final : public OpCode { public: ExecutionResult execute(MatchInput const& input, MatchState& state, MatchOutput& output) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; } - ALWAYS_INLINE size_t size() const override { return 3; } + 
ALWAYS_INLINE size_t size() const override { return 4; } ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; } ALWAYS_INLINE size_t length() const { return argument(1); } + ALWAYS_INLINE size_t id() const { return argument(2); } String const arguments_string() const override { return String::formatted("name={}, length={}", name(), length()); diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index 599e68451a..98c324b359 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -442,11 +442,20 @@ public: } Match(String const string_, size_t const line_, size_t const column_, size_t const global_offset_) - : string(string_) + : string(move(string_)) , view(string.value().view()) , line(line_) , column(column_) , global_offset(global_offset_) + { + } + + Match(RegexStringView const view_, StringView capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_) + : view(view_) + , capture_group_name(capture_group_name_) + , line(line_) + , column(column_) + , global_offset(global_offset_) , left_column(column_) { } @@ -454,6 +463,7 @@ public: void reset() { view = view.typed_null_view(); + capture_group_name.clear(); line = 0; column = 0; global_offset = 0; @@ -461,6 +471,7 @@ public: } RegexStringView view { nullptr }; + Optional<StringView> capture_group_name {}; size_t line { 0 }; size_t column { 0 }; size_t global_offset { 0 }; @@ -494,8 +505,6 @@ struct MatchState { size_t fork_at_position { 0 }; Vector<Match> matches; Vector<Vector<Match>> capture_group_matches; - Vector<HashMap<String, Match>> named_capture_group_matches; - size_t recursion_level { 0 }; }; struct MatchOutput { diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index c312ca3c81..0d41c9af42 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ 
b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -149,10 +149,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona if (c_match_preallocation_count) { state.matches.ensure_capacity(c_match_preallocation_count); state.capture_group_matches.ensure_capacity(c_match_preallocation_count); - state.named_capture_group_matches.ensure_capacity(c_match_preallocation_count); - auto& capture_groups_count = m_pattern->parser_result.capture_groups_count; - auto& named_capture_groups_count = m_pattern->parser_result.named_capture_groups_count; for (size_t j = 0; j < c_match_preallocation_count; ++j) { state.matches.empend(); @@ -160,9 +157,6 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona state.capture_group_matches.at(j).ensure_capacity(capture_groups_count); for (size_t k = 0; k < capture_groups_count; ++k) state.capture_group_matches.at(j).unchecked_append({}); - - state.named_capture_group_matches.unchecked_append({}); - state.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count); } } @@ -315,15 +309,9 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona matches.template remove_all_matching([](auto& match) { return match.view.is_null(); }); } - output_copy.named_capture_group_matches = state.named_capture_group_matches; - // Make sure there are as many capture matches as there are actual matches. 
- if (output_copy.named_capture_group_matches.size() < match_count) - output_copy.named_capture_group_matches.resize(match_count); - output_copy.matches = state.matches; } else { output_copy.capture_group_matches.clear_with_capacity(); - output_copy.named_capture_group_matches.clear_with_capacity(); } return { @@ -331,7 +319,6 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona match_count, move(output_copy.matches), move(output_copy.capture_group_matches), - move(output_copy.named_capture_group_matches), output.operations, m_pattern->parser_result.capture_groups_count, m_pattern->parser_result.named_capture_groups_count, @@ -399,9 +386,8 @@ private: template<class Parser> Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& state, MatchOutput& output) const { - state.recursion_level = 0; - BumpAllocatedLinkedList<MatchState> states_to_try_next; + size_t recursion_level = 0; auto& bytecode = m_pattern->parser_result.bytecode; @@ -410,7 +396,7 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta auto& opcode = bytecode.get_opcode(state); #if REGEX_DEBUG - s_regex_dbg.print_opcode("VM", opcode, state, state.recursion_level, false); + s_regex_dbg.print_opcode("VM", opcode, state, recursion_level, false); #endif ExecutionResult result; @@ -435,7 +421,7 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta case ExecutionResult::Fork_PrioHigh: states_to_try_next.append(state); state.instruction_position = state.fork_at_position; - ++state.recursion_level; + ++recursion_level; continue; case ExecutionResult::Continue: continue; @@ -454,7 +440,7 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta return false; } state = states_to_try_next.take_last(); - ++state.recursion_level; + ++recursion_level; continue; } } diff --git a/Userland/Libraries/LibRegex/RegexMatcher.h b/Userland/Libraries/LibRegex/RegexMatcher.h index 
8e1ce64e6d..4a1e64b8f4 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.h +++ b/Userland/Libraries/LibRegex/RegexMatcher.h @@ -31,7 +31,6 @@ struct RegexResult final { size_t count { 0 }; Vector<Match> matches; Vector<Vector<Match>> capture_group_matches; - Vector<HashMap<String, Match>> named_capture_group_matches; size_t n_operations { 0 }; size_t n_capture_groups { 0 }; size_t n_named_capture_groups { 0 }; diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 84768cf121..db08d836fc 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -150,7 +150,6 @@ ALWAYS_INLINE void Parser::reset() m_parser_state.capture_group_minimum_lengths.clear(); m_parser_state.capture_groups_count = 0; m_parser_state.named_capture_groups_count = 0; - m_parser_state.named_capture_group_minimum_lengths.clear(); m_parser_state.named_capture_groups.clear(); } @@ -780,12 +779,8 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, si } } - if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) { - if (capture_group_name.has_value()) - bytecode.insert_bytecode_group_capture_left(capture_group_name.value()); - else - bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count); - } + if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) + bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count); ByteCode capture_group_bytecode; @@ -814,12 +809,12 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, si if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) { if (capture_group_name.has_value()) { - bytecode.insert_bytecode_group_capture_right(capture_group_name.value()); + bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count, 
capture_group_name.value()); ++m_parser_state.named_capture_groups_count; } else { bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count); - ++m_parser_state.capture_groups_count; } + ++m_parser_state.capture_groups_count; } should_parse_repetition_symbol = true; break; @@ -1564,14 +1559,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini set_error(Error::InvalidNameForCaptureGroup); return false; } - auto maybe_length = m_parser_state.named_capture_group_minimum_lengths.get(name); - if (!maybe_length.has_value()) { + auto maybe_capture_group = m_parser_state.named_capture_groups.get(name); + if (!maybe_capture_group.has_value()) { set_error(Error::InvalidNameForCaptureGroup); return false; } - match_length_minimum += maybe_length.value(); + match_length_minimum += maybe_capture_group->minimum_length; - stack.insert_bytecode_compare_named_reference(name); + stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)maybe_capture_group->group_index } }); return true; } @@ -2121,15 +2116,8 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi m_capture_groups_in_scope.last().empend(identifier); }; auto clear_all_capture_groups_in_scope = [&] { - for (auto& entry : m_capture_groups_in_scope.last()) { - entry.visit( - [&](size_t index) { - stack.insert_bytecode_clear_capture_group(index); - }, - [&](String const& name) { - stack.insert_bytecode_clear_named_capture_group(name); - }); - } + for (auto& index : m_capture_groups_in_scope.last()) + stack.insert_bytecode_clear_capture_group(index); }; if (match(TokenType::Questionmark)) { @@ -2172,21 +2160,18 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi clear_all_capture_groups_in_scope(); exit_capture_group_scope(); - register_capture_group_in_current_scope(name); register_capture_group_in_current_scope(group_index); consume(TokenType::RightParen, 
Error::MismatchingParen); - stack.insert_bytecode_group_capture_left(name); stack.insert_bytecode_group_capture_left(group_index); stack.extend(move(capture_group_bytecode)); - stack.insert_bytecode_group_capture_right(name); - stack.insert_bytecode_group_capture_right(group_index); + stack.insert_bytecode_group_capture_right(group_index, name); match_length_minimum += length; - m_parser_state.named_capture_group_minimum_lengths.set(name, length); m_parser_state.capture_group_minimum_lengths.set(group_index, length); + m_parser_state.named_capture_groups.set(name, { group_index, length }); return true; } diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index 25a9677189..c1f54be69c 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -88,6 +88,11 @@ protected: ALWAYS_INLINE bool done() const; ALWAYS_INLINE bool set_error(Error error); + struct NamedCaptureGroup { + size_t group_index { 0 }; + size_t minimum_length { 0 }; + }; + struct ParserState { Lexer& lexer; Token current_token; @@ -99,8 +104,7 @@ protected: size_t match_length_minimum { 0 }; AllOptions regex_options; HashMap<int, size_t> capture_group_minimum_lengths; - HashMap<FlyString, size_t> named_capture_group_minimum_lengths; - HashMap<size_t, FlyString> named_capture_groups; + HashMap<FlyString, NamedCaptureGroup> named_capture_groups; explicit ParserState(Lexer& lexer) : lexer(lexer) @@ -258,8 +262,7 @@ private: // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it, // by requiring that (...)+ only contain the matches for the last iteration. // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed. 
- using CaptureGroup = Variant<size_t, String>; - Vector<Vector<CaptureGroup>> m_capture_groups_in_scope; + Vector<Vector<size_t>> m_capture_groups_in_scope; }; using PosixExtended = PosixExtendedParser; |