diff options
author | Ali Mohammad Pur <ali.mpfard@gmail.com> | 2021-10-03 19:01:25 +0330 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2021-10-03 19:16:36 +0200 |
commit | 8f722302d9c038ae9c307bc553e55b8dbbcff7ed (patch) | |
tree | 0fa7cafcc822db448c6ffbe422d824f17a8e4989 /Userland/Libraries | |
parent | 478b36c37b887d6d33915738111e9063778ceab2 (diff) | |
download | serenity-8f722302d9c038ae9c307bc553e55b8dbbcff7ed.zip |
LibRegex: Use a match table for character classes
Generate a sorted, compressed series of ranges in a match table for
character classes, and use a binary search to find the matches.
This is about a 3-4x speedup for character class match performance. :^)
Diffstat (limited to 'Userland/Libraries')
-rw-r--r-- | Userland/Libraries/LibRegex/Forward.h | 2 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.cpp | 47 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.h | 52 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexBytecodeStreamOptimizer.h | 2 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexOptimizer.cpp | 146 |
5 files changed, 214 insertions, 35 deletions
diff --git a/Userland/Libraries/LibRegex/Forward.h b/Userland/Libraries/LibRegex/Forward.h index a84b4820cf..0c5516f7dd 100644 --- a/Userland/Libraries/LibRegex/Forward.h +++ b/Userland/Libraries/LibRegex/Forward.h @@ -9,6 +9,8 @@ #include <AK/Types.h> namespace regex { +struct CompareTypeAndValuePair; + enum class Error : u8; class Lexer; class PosixExtendedParser; diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 9f979d75f7..e96bb67598 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -7,6 +7,7 @@ #include "RegexByteCode.h" #include "AK/StringBuilder.h" #include "RegexDebug.h" +#include <AK/BinarySearch.h> #include <AK/CharacterTypes.h> #include <AK/Debug.h> #include <LibUnicode/CharacterTypes.h> @@ -491,6 +492,38 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched); + } else if (compare_type == CharacterCompareType::LookupTable) { + if (input.view.length() <= state.string_position) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + auto count = m_bytecode->at(offset++); + auto range_data = m_bytecode->spans().slice(offset, count); + offset += count; + + auto ch = input.view.substring_view(state.string_position, 1)[0]; + + auto matching_range = binary_search(range_data, ch, nullptr, [insensitive = input.regex_options & AllFlags::Insensitive](auto needle, CharRange range) { + auto from = range.from; + auto to = range.to; + if (insensitive) { + from = to_ascii_lowercase(from); + to = to_ascii_lowercase(to); + needle = to_ascii_lowercase(needle); + } + if (needle > range.to) + return 1; + if (needle < range.from) + return -1; + return 0; + }); + + if (matching_range) { + if (current_inversion_state()) + inverse_matched = true; + else + advance_string_position(state, input.view, ch); + } + } else if (compare_type == CharacterCompareType::CharRange) { if (input.view.length() <= state.string_position) return ExecutionResult::Failed_ExecuteLowPrioForks; @@ -816,6 +849,10 @@ Vector<CompareTypeAndValuePair> OpCode_Compare::flat_compares() const } else if (compare_type == CharacterCompareType::CharRange) { auto value = m_bytecode->at(offset++); result.append({ compare_type, value }); + } else if (compare_type == CharacterCompareType::LookupTable) { + auto count = m_bytecode->at(offset++); + for (size_t i = 0; i < count; ++i) + result.append({ CharacterCompareType::CharRange, m_bytecode->at(offset++) }); } else { result.append({ compare_type, 0 }); } @@ -884,6 +921,16 @@ Vector<String> const OpCode_Compare::variable_arguments_to_string(Optional<Match result.empend(String::formatted( "compare against: '{}'", input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string())); + } else if (compare_type == CharacterCompareType::LookupTable) { + auto count = m_bytecode->at(offset++); + for (size_t j = 0; j < count; ++j) { + auto range = (CharRange)m_bytecode->at(offset++); + result.append(String::formatted("{:x}-{:x}", range.from, range.to)); + } + if (!view.is_null() && view.length() > state().string_position) + result.empend(String::formatted( + "compare against: '{}'", + input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string())); } } return result; diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h index cea60b9fcf..950f668645 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.h +++ b/Userland/Libraries/LibRegex/RegexByteCode.h @@ -61,21 +61,22 @@ enum class OpCodeId : ByteCodeValueType { }; // clang-format on -#define ENUMERATE_CHARACTER_COMPARE_TYPES \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) +#define ENUMERATE_CHARACTER_COMPARE_TYPES \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable) enum class CharacterCompareType : ByteCodeValueType { #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x, @@ -186,26 +187,7 @@ public: void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs) { - ByteCode bytecode; - - bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare)); - bytecode.empend(pairs.size()); // number of arguments - - ByteCode arguments; - for (auto& value : pairs) { - VERIFY(value.type != CharacterCompareType::RangeExpressionDummy); - VERIFY(value.type != CharacterCompareType::Undefined); - VERIFY(value.type != CharacterCompareType::String); - - arguments.append((ByteCodeValueType)value.type); - if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse) - arguments.append(move(value.value)); - } - - bytecode.empend(arguments.size()); // size of arguments - bytecode.extend(move(arguments)); - - extend(move(bytecode)); + Optimizer::append_character_class(*this, move(pairs)); } void insert_bytecode_check_boundary(BoundaryCheckType type) diff --git a/Userland/Libraries/LibRegex/RegexBytecodeStreamOptimizer.h b/Userland/Libraries/LibRegex/RegexBytecodeStreamOptimizer.h index 7fff2809f5..373577e458 100644 --- a/Userland/Libraries/LibRegex/RegexBytecodeStreamOptimizer.h +++ b/Userland/Libraries/LibRegex/RegexBytecodeStreamOptimizer.h @@ -7,12 +7,14 @@ #pragma once #include "Forward.h" +#include <AK/Vector.h> namespace regex { class Optimizer { public: static void append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right); + static void append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs); }; } diff --git a/Userland/Libraries/LibRegex/RegexOptimizer.cpp b/Userland/Libraries/LibRegex/RegexOptimizer.cpp index 4546f607b6..cd225f2d90 100644 --- a/Userland/Libraries/LibRegex/RegexOptimizer.cpp +++ b/Userland/Libraries/LibRegex/RegexOptimizer.cpp @@ -488,6 +488,152 @@ void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& // LABEL _END = alterantive_bytecode.size } +enum class LookupTableInsertionOutcome { + Successful, + ReplaceWithAnyChar, + TemporaryInversionNeeded, + PermanentInversionNeeded, + CannotPlaceInTable, +}; +static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCodeValueType, CharRange>& table, CompareTypeAndValuePair pair) +{ + switch (pair.type) { + case CharacterCompareType::Inverse: + return LookupTableInsertionOutcome::PermanentInversionNeeded; + case CharacterCompareType::TemporaryInverse: + return LookupTableInsertionOutcome::TemporaryInversionNeeded; + case CharacterCompareType::AnyChar: + return LookupTableInsertionOutcome::ReplaceWithAnyChar; + case CharacterCompareType::CharClass: + return LookupTableInsertionOutcome::CannotPlaceInTable; + case CharacterCompareType::Char: + table.insert(pair.value, { (u32)pair.value, (u32)pair.value }); + break; + case CharacterCompareType::CharRange: { + CharRange range { pair.value }; + table.insert(range.from, range); + break; + } + case CharacterCompareType::Reference: + case CharacterCompareType::Property: + case CharacterCompareType::GeneralCategory: + case CharacterCompareType::Script: + case CharacterCompareType::ScriptExtension: + return LookupTableInsertionOutcome::CannotPlaceInTable; + case CharacterCompareType::Undefined: + case CharacterCompareType::RangeExpressionDummy: + case CharacterCompareType::String: + case CharacterCompareType::LookupTable: + VERIFY_NOT_REACHED(); + } + + return LookupTableInsertionOutcome::Successful; +} + +void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs) +{ + ByteCode arguments; + size_t argument_count = 0; + + if (pairs.size() <= 1) { + for (auto& pair : pairs) { + arguments.append(to_underlying(pair.type)); + if (pair.type != CharacterCompareType::AnyChar && pair.type != CharacterCompareType::TemporaryInverse && pair.type != CharacterCompareType::Inverse) + arguments.append(pair.value); + ++argument_count; + } + } else { + RedBlackTree<ByteCodeValueType, CharRange> table; + RedBlackTree<ByteCodeValueType, CharRange> inverted_table; + auto* current_table = &table; + auto* current_inverted_table = &inverted_table; + bool invert_for_next_iteration = false; + bool is_currently_inverted = false; + + for (auto& value : pairs) { + auto should_invert_after_this_iteration = invert_for_next_iteration; + invert_for_next_iteration = false; + + auto insertion_result = insert_into_lookup_table(*current_table, value); + switch (insertion_result) { + case LookupTableInsertionOutcome::Successful: + break; + case LookupTableInsertionOutcome::ReplaceWithAnyChar: { + table.clear(); + inverted_table.clear(); + arguments.append(to_underlying(CharacterCompareType::AnyChar)); + ++argument_count; + break; + } + case LookupTableInsertionOutcome::TemporaryInversionNeeded: + swap(current_table, current_inverted_table); + invert_for_next_iteration = true; + is_currently_inverted = !is_currently_inverted; + break; + case LookupTableInsertionOutcome::PermanentInversionNeeded: + swap(current_table, current_inverted_table); + is_currently_inverted = !is_currently_inverted; + break; + case LookupTableInsertionOutcome::CannotPlaceInTable: + if (is_currently_inverted) { + arguments.append(to_underlying(CharacterCompareType::TemporaryInverse)); + ++argument_count; + } + arguments.append(to_underlying(value.type)); + arguments.append(value.value); + ++argument_count; + break; + } + + if (should_invert_after_this_iteration) { + swap(current_table, current_inverted_table); + is_currently_inverted = !is_currently_inverted; + } + } + auto append_table = [&](auto& table) { + ++argument_count; + arguments.append(to_underlying(CharacterCompareType::LookupTable)); + auto size_index = arguments.size(); + arguments.append(0); + Optional<CharRange> active_range; + size_t range_count = 0; + for (auto& range : table) { + if (!active_range.has_value()) { + active_range = range; + continue; + } + + if (range.from <= active_range->to + 1 && range.to + 1 >= active_range->from) { + active_range = CharRange { min(range.from, active_range->from), max(range.to, active_range->to) }; + } else { + ++range_count; + arguments.append(active_range.release_value()); + active_range = range; + } + } + if (active_range.has_value()) { + ++range_count; + arguments.append(active_range.release_value()); + } + arguments[size_index] = range_count; + }; + + if (!table.is_empty()) + append_table(table); + + if (!inverted_table.is_empty()) { + ++argument_count; + arguments.append(to_underlying(CharacterCompareType::TemporaryInverse)); + append_table(inverted_table); + } + } + + target.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare)); + target.empend(argument_count); // number of arguments + target.empend(arguments.size()); // size of arguments + target.extend(move(arguments)); +} + template void Regex<PosixBasicParser>::run_optimization_passes(); template void Regex<PosixExtendedParser>::run_optimization_passes(); template void Regex<ECMA262Parser>::run_optimization_passes(); |