summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAli Mohammad Pur <ali.mpfard@gmail.com>2021-09-15 14:31:55 +0430
committerAli Mohammad Pur <Ali.mpfard@gmail.com>2021-09-15 15:52:28 +0430
commit741886a4c46fc02672da40bf9057b43454ad52cc (patch)
treef12bd6ef5476801289c02b1627498fb87bf065c3
parent7de23aede23c9701e24240eb8ffa1796fe24d82b (diff)
downloadserenity-741886a4c46fc02672da40bf9057b43454ad52cc.zip
LibRegex: Make the optimizer understand references and capture groups
Otherwise the fork in patterns like `(1+)\1` would be (incorrectly) optimized away.
-rw-r--r--Tests/LibRegex/Regex.cpp4
-rw-r--r--Userland/Libraries/LibRegex/RegexOptimizer.cpp21
2 files changed, 23 insertions, 2 deletions
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp
index 4aeffb8da8..bd217501fc 100644
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@@ -898,6 +898,10 @@ TEST_CASE(optimizer_atomic_groups)
// Alternative fuse
Tuple { "(abcfoo|abcbar|abcbaz).*x"sv, "abcbarx"sv, true },
Tuple { "(a|a)"sv, "a"sv, true },
+ // ForkReplace shouldn't be applied where it would change the semantics
+ Tuple { "(1+)\\1"sv, "11"sv, true },
+ Tuple { "(1+)1"sv, "11"sv, true },
+ Tuple { "(1+)0"sv, "10"sv, true },
};
for (auto& test : tests) {
diff --git a/Userland/Libraries/LibRegex/RegexOptimizer.cpp b/Userland/Libraries/LibRegex/RegexOptimizer.cpp
index d19c2178b0..fc82f0d6ac 100644
--- a/Userland/Libraries/LibRegex/RegexOptimizer.cpp
+++ b/Userland/Libraries/LibRegex/RegexOptimizer.cpp
@@ -104,6 +104,7 @@ typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks()
static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode, Block const& repeated_block, Block const& following_block)
{
Vector<Vector<CompareTypeAndValuePair>> repeated_values;
+ HashTable<size_t> active_capture_groups;
MatchState state;
for (state.instruction_position = repeated_block.start; state.instruction_position < repeated_block.end;) {
auto& opcode = bytecode.get_opcode(state);
@@ -126,6 +127,12 @@ static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode
case OpCodeId::Restore:
case OpCodeId::GoBack:
return false;
+ case OpCodeId::SaveRightCaptureGroup:
+ active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
+ break;
+ case OpCodeId::SaveLeftCaptureGroup:
+ active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
+ break;
default:
break;
}
@@ -133,19 +140,29 @@ static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode
state.instruction_position += opcode.size();
}
dbgln_if(REGEX_DEBUG, "Found {} entries in reference", repeated_values.size());
+ dbgln_if(REGEX_DEBUG, "Found {} active capture groups", active_capture_groups.size());
// Find the first compare in the following block, it must NOT match any of the values in `repeated_values'.
for (state.instruction_position = following_block.start; state.instruction_position < following_block.end;) {
auto& opcode = bytecode.get_opcode(state);
switch (opcode.opcode_id()) {
+ // Note: These have to exist since we're effectively repeating the following block as well
+ case OpCodeId::SaveRightCaptureGroup:
+ active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
+ break;
+ case OpCodeId::SaveLeftCaptureGroup:
+ active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
+ break;
case OpCodeId::Compare: {
// We found a compare, let's see what it has.
auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
if (compares.is_empty())
break;
- // If either side can match _anything_, fail.
- if (any_of(compares, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
+ if (any_of(compares, [&](auto& compare) {
+ return compare.type == CharacterCompareType::AnyChar
+ || (compare.type == CharacterCompareType::Reference && active_capture_groups.contains(compare.value));
+ }))
return false;
for (auto& repeated_value : repeated_values) {