diff options
author | Ali Mohammad Pur <ali.mpfard@gmail.com> | 2021-09-15 14:31:55 +0430 |
---|---|---|
committer | Ali Mohammad Pur <Ali.mpfard@gmail.com> | 2021-09-15 15:52:28 +0430 |
commit | 741886a4c46fc02672da40bf9057b43454ad52cc (patch) | |
tree | f12bd6ef5476801289c02b1627498fb87bf065c3 | |
parent | 7de23aede23c9701e24240eb8ffa1796fe24d82b (diff) | |
download | serenity-741886a4c46fc02672da40bf9057b43454ad52cc.zip |
LibRegex: Make the optimizer understand references and capture groups
Otherwise the fork in patterns like `(1+)\1` would be (incorrectly)
optimized away.
-rw-r--r-- | Tests/LibRegex/Regex.cpp | 4 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexOptimizer.cpp | 21 |
2 files changed, 23 insertions, 2 deletions
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 4aeffb8da8..bd217501fc 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -898,6 +898,10 @@ TEST_CASE(optimizer_atomic_groups) // Alternative fuse Tuple { "(abcfoo|abcbar|abcbaz).*x"sv, "abcbarx"sv, true }, Tuple { "(a|a)"sv, "a"sv, true }, + // ForkReplace shouldn't be applied where it would change the semantics + Tuple { "(1+)\\1"sv, "11"sv, true }, + Tuple { "(1+)1"sv, "11"sv, true }, + Tuple { "(1+)0"sv, "10"sv, true }, }; for (auto& test : tests) { diff --git a/Userland/Libraries/LibRegex/RegexOptimizer.cpp b/Userland/Libraries/LibRegex/RegexOptimizer.cpp index d19c2178b0..fc82f0d6ac 100644 --- a/Userland/Libraries/LibRegex/RegexOptimizer.cpp +++ b/Userland/Libraries/LibRegex/RegexOptimizer.cpp @@ -104,6 +104,7 @@ typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks() static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode, Block const& repeated_block, Block const& following_block) { Vector<Vector<CompareTypeAndValuePair>> repeated_values; + HashTable<size_t> active_capture_groups; MatchState state; for (state.instruction_position = repeated_block.start; state.instruction_position < repeated_block.end;) { auto& opcode = bytecode.get_opcode(state); @@ -126,6 +127,12 @@ static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode case OpCodeId::Restore: case OpCodeId::GoBack: return false; + case OpCodeId::SaveRightCaptureGroup: + active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id()); + break; + case OpCodeId::SaveLeftCaptureGroup: + active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id()); + break; default: break; } @@ -133,19 +140,29 @@ static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode state.instruction_position += opcode.size(); } dbgln_if(REGEX_DEBUG, "Found {} entries in reference", repeated_values.size()); + dbgln_if(REGEX_DEBUG, "Found {} active capture groups", active_capture_groups.size()); // Find the first compare in the following block, it must NOT match any of the values in `repeated_values'. for (state.instruction_position = following_block.start; state.instruction_position < following_block.end;) { auto& opcode = bytecode.get_opcode(state); switch (opcode.opcode_id()) { + // Note: These have to exist since we're effectively repeating the following block as well + case OpCodeId::SaveRightCaptureGroup: + active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id()); + break; + case OpCodeId::SaveLeftCaptureGroup: + active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id()); + break; case OpCodeId::Compare: { // We found a compare, let's see what it has. auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares(); if (compares.is_empty()) break; - // If either side can match _anything_, fail. - if (any_of(compares, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; })) + if (any_of(compares, [&](auto& compare) { + return compare.type == CharacterCompareType::AnyChar + || (compare.type == CharacterCompareType::Reference && active_capture_groups.contains(compare.value)); + })) return false; for (auto& repeated_value : repeated_values) { |