summaryrefslogtreecommitdiff
path: root/Userland/Libraries
diff options
context:
space:
mode:
authorAli Mohammad Pur <ali.mpfard@gmail.com>2021-10-03 19:01:25 +0330
committerAndreas Kling <kling@serenityos.org>2021-10-03 19:16:36 +0200
commit8f722302d9c038ae9c307bc553e55b8dbbcff7ed (patch)
tree0fa7cafcc822db448c6ffbe422d824f17a8e4989 /Userland/Libraries
parent478b36c37b887d6d33915738111e9063778ceab2 (diff)
downloadserenity-8f722302d9c038ae9c307bc553e55b8dbbcff7ed.zip
LibRegex: Use a match table for character classes
Generate a sorted, compressed series of ranges in a match table for character classes, and use a binary search to find the matches. This is about a 3-4x speedup for character class match performance. :^)
Diffstat (limited to 'Userland/Libraries')
-rw-r--r--Userland/Libraries/LibRegex/Forward.h2
-rw-r--r--Userland/Libraries/LibRegex/RegexByteCode.cpp47
-rw-r--r--Userland/Libraries/LibRegex/RegexByteCode.h52
-rw-r--r--Userland/Libraries/LibRegex/RegexBytecodeStreamOptimizer.h2
-rw-r--r--Userland/Libraries/LibRegex/RegexOptimizer.cpp146
5 files changed, 214 insertions, 35 deletions
diff --git a/Userland/Libraries/LibRegex/Forward.h b/Userland/Libraries/LibRegex/Forward.h
index a84b4820cf..0c5516f7dd 100644
--- a/Userland/Libraries/LibRegex/Forward.h
+++ b/Userland/Libraries/LibRegex/Forward.h
@@ -9,6 +9,8 @@
#include <AK/Types.h>
namespace regex {
+struct CompareTypeAndValuePair;
+
enum class Error : u8;
class Lexer;
class PosixExtendedParser;
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index 9f979d75f7..e96bb67598 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -7,6 +7,7 @@
#include "RegexByteCode.h"
#include "AK/StringBuilder.h"
#include "RegexDebug.h"
+#include <AK/BinarySearch.h>
#include <AK/CharacterTypes.h>
#include <AK/Debug.h>
#include <LibUnicode/CharacterTypes.h>
@@ -491,6 +492,38 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
+ } else if (compare_type == CharacterCompareType::LookupTable) {
+ if (input.view.length() <= state.string_position)
+ return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+ auto count = m_bytecode->at(offset++);
+ auto range_data = m_bytecode->spans().slice(offset, count);
+ offset += count;
+
+ auto ch = input.view.substring_view(state.string_position, 1)[0];
+
+ auto matching_range = binary_search(range_data, ch, nullptr, [insensitive = input.regex_options & AllFlags::Insensitive](auto needle, CharRange range) {
+ auto from = range.from;
+ auto to = range.to;
+ if (insensitive) {
+ from = to_ascii_lowercase(from);
+ to = to_ascii_lowercase(to);
+ needle = to_ascii_lowercase(needle);
+ }
+ if (needle > range.to)
+ return 1;
+ if (needle < range.from)
+ return -1;
+ return 0;
+ });
+
+ if (matching_range) {
+ if (current_inversion_state())
+ inverse_matched = true;
+ else
+ advance_string_position(state, input.view, ch);
+ }
+
} else if (compare_type == CharacterCompareType::CharRange) {
if (input.view.length() <= state.string_position)
return ExecutionResult::Failed_ExecuteLowPrioForks;
@@ -816,6 +849,10 @@ Vector<CompareTypeAndValuePair> OpCode_Compare::flat_compares() const
} else if (compare_type == CharacterCompareType::CharRange) {
auto value = m_bytecode->at(offset++);
result.append({ compare_type, value });
+ } else if (compare_type == CharacterCompareType::LookupTable) {
+ auto count = m_bytecode->at(offset++);
+ for (size_t i = 0; i < count; ++i)
+ result.append({ CharacterCompareType::CharRange, m_bytecode->at(offset++) });
} else {
result.append({ compare_type, 0 });
}
@@ -884,6 +921,16 @@ Vector<String> const OpCode_Compare::variable_arguments_to_string(Optional<Match
result.empend(String::formatted(
"compare against: '{}'",
input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
+ } else if (compare_type == CharacterCompareType::LookupTable) {
+ auto count = m_bytecode->at(offset++);
+ for (size_t j = 0; j < count; ++j) {
+ auto range = (CharRange)m_bytecode->at(offset++);
+ result.append(String::formatted("{:x}-{:x}", range.from, range.to));
+ }
+ if (!view.is_null() && view.length() > state().string_position)
+ result.empend(String::formatted(
+ "compare against: '{}'",
+ input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
}
}
return result;
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h
index cea60b9fcf..950f668645 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.h
+++ b/Userland/Libraries/LibRegex/RegexByteCode.h
@@ -61,21 +61,22 @@ enum class OpCodeId : ByteCodeValueType {
};
// clang-format on
-#define ENUMERATE_CHARACTER_COMPARE_TYPES \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \
- __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
+#define ENUMERATE_CHARACTER_COMPARE_TYPES \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable)
enum class CharacterCompareType : ByteCodeValueType {
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
@@ -186,26 +187,7 @@ public:
void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
{
- ByteCode bytecode;
-
- bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
- bytecode.empend(pairs.size()); // number of arguments
-
- ByteCode arguments;
- for (auto& value : pairs) {
- VERIFY(value.type != CharacterCompareType::RangeExpressionDummy);
- VERIFY(value.type != CharacterCompareType::Undefined);
- VERIFY(value.type != CharacterCompareType::String);
-
- arguments.append((ByteCodeValueType)value.type);
- if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse)
- arguments.append(move(value.value));
- }
-
- bytecode.empend(arguments.size()); // size of arguments
- bytecode.extend(move(arguments));
-
- extend(move(bytecode));
+ Optimizer::append_character_class(*this, move(pairs));
}
void insert_bytecode_check_boundary(BoundaryCheckType type)
diff --git a/Userland/Libraries/LibRegex/RegexBytecodeStreamOptimizer.h b/Userland/Libraries/LibRegex/RegexBytecodeStreamOptimizer.h
index 7fff2809f5..373577e458 100644
--- a/Userland/Libraries/LibRegex/RegexBytecodeStreamOptimizer.h
+++ b/Userland/Libraries/LibRegex/RegexBytecodeStreamOptimizer.h
@@ -7,12 +7,14 @@
#pragma once
#include "Forward.h"
+#include <AK/Vector.h>
namespace regex {
class Optimizer {
public:
static void append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right);
+ static void append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs);
};
}
diff --git a/Userland/Libraries/LibRegex/RegexOptimizer.cpp b/Userland/Libraries/LibRegex/RegexOptimizer.cpp
index 4546f607b6..cd225f2d90 100644
--- a/Userland/Libraries/LibRegex/RegexOptimizer.cpp
+++ b/Userland/Libraries/LibRegex/RegexOptimizer.cpp
@@ -488,6 +488,152 @@ void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&&
// LABEL _END = alterantive_bytecode.size
}
+enum class LookupTableInsertionOutcome {
+ Successful,
+ ReplaceWithAnyChar,
+ TemporaryInversionNeeded,
+ PermanentInversionNeeded,
+ CannotPlaceInTable,
+};
+static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCodeValueType, CharRange>& table, CompareTypeAndValuePair pair)
+{
+ switch (pair.type) {
+ case CharacterCompareType::Inverse:
+ return LookupTableInsertionOutcome::PermanentInversionNeeded;
+ case CharacterCompareType::TemporaryInverse:
+ return LookupTableInsertionOutcome::TemporaryInversionNeeded;
+ case CharacterCompareType::AnyChar:
+ return LookupTableInsertionOutcome::ReplaceWithAnyChar;
+ case CharacterCompareType::CharClass:
+ return LookupTableInsertionOutcome::CannotPlaceInTable;
+ case CharacterCompareType::Char:
+ table.insert(pair.value, { (u32)pair.value, (u32)pair.value });
+ break;
+ case CharacterCompareType::CharRange: {
+ CharRange range { pair.value };
+ table.insert(range.from, range);
+ break;
+ }
+ case CharacterCompareType::Reference:
+ case CharacterCompareType::Property:
+ case CharacterCompareType::GeneralCategory:
+ case CharacterCompareType::Script:
+ case CharacterCompareType::ScriptExtension:
+ return LookupTableInsertionOutcome::CannotPlaceInTable;
+ case CharacterCompareType::Undefined:
+ case CharacterCompareType::RangeExpressionDummy:
+ case CharacterCompareType::String:
+ case CharacterCompareType::LookupTable:
+ VERIFY_NOT_REACHED();
+ }
+
+ return LookupTableInsertionOutcome::Successful;
+}
+
+void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs)
+{
+ ByteCode arguments;
+ size_t argument_count = 0;
+
+ if (pairs.size() <= 1) {
+ for (auto& pair : pairs) {
+ arguments.append(to_underlying(pair.type));
+ if (pair.type != CharacterCompareType::AnyChar && pair.type != CharacterCompareType::TemporaryInverse && pair.type != CharacterCompareType::Inverse)
+ arguments.append(pair.value);
+ ++argument_count;
+ }
+ } else {
+ RedBlackTree<ByteCodeValueType, CharRange> table;
+ RedBlackTree<ByteCodeValueType, CharRange> inverted_table;
+ auto* current_table = &table;
+ auto* current_inverted_table = &inverted_table;
+ bool invert_for_next_iteration = false;
+ bool is_currently_inverted = false;
+
+ for (auto& value : pairs) {
+ auto should_invert_after_this_iteration = invert_for_next_iteration;
+ invert_for_next_iteration = false;
+
+ auto insertion_result = insert_into_lookup_table(*current_table, value);
+ switch (insertion_result) {
+ case LookupTableInsertionOutcome::Successful:
+ break;
+ case LookupTableInsertionOutcome::ReplaceWithAnyChar: {
+ table.clear();
+ inverted_table.clear();
+ arguments.append(to_underlying(CharacterCompareType::AnyChar));
+ ++argument_count;
+ break;
+ }
+ case LookupTableInsertionOutcome::TemporaryInversionNeeded:
+ swap(current_table, current_inverted_table);
+ invert_for_next_iteration = true;
+ is_currently_inverted = !is_currently_inverted;
+ break;
+ case LookupTableInsertionOutcome::PermanentInversionNeeded:
+ swap(current_table, current_inverted_table);
+ is_currently_inverted = !is_currently_inverted;
+ break;
+ case LookupTableInsertionOutcome::CannotPlaceInTable:
+ if (is_currently_inverted) {
+ arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
+ ++argument_count;
+ }
+ arguments.append(to_underlying(value.type));
+ arguments.append(value.value);
+ ++argument_count;
+ break;
+ }
+
+ if (should_invert_after_this_iteration) {
+ swap(current_table, current_inverted_table);
+ is_currently_inverted = !is_currently_inverted;
+ }
+ }
+ auto append_table = [&](auto& table) {
+ ++argument_count;
+ arguments.append(to_underlying(CharacterCompareType::LookupTable));
+ auto size_index = arguments.size();
+ arguments.append(0);
+ Optional<CharRange> active_range;
+ size_t range_count = 0;
+ for (auto& range : table) {
+ if (!active_range.has_value()) {
+ active_range = range;
+ continue;
+ }
+
+ if (range.from <= active_range->to + 1 && range.to + 1 >= active_range->from) {
+ active_range = CharRange { min(range.from, active_range->from), max(range.to, active_range->to) };
+ } else {
+ ++range_count;
+ arguments.append(active_range.release_value());
+ active_range = range;
+ }
+ }
+ if (active_range.has_value()) {
+ ++range_count;
+ arguments.append(active_range.release_value());
+ }
+ arguments[size_index] = range_count;
+ };
+
+ if (!table.is_empty())
+ append_table(table);
+
+ if (!inverted_table.is_empty()) {
+ ++argument_count;
+ arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
+ append_table(inverted_table);
+ }
+ }
+
+ target.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
+ target.empend(argument_count); // number of arguments
+ target.empend(arguments.size()); // size of arguments
+ target.extend(move(arguments));
+}
+
template void Regex<PosixBasicParser>::run_optimization_passes();
template void Regex<PosixExtendedParser>::run_optimization_passes();
template void Regex<ECMA262Parser>::run_optimization_passes();