diff options
author | Ali Mohammad Pur <ali.mpfard@gmail.com> | 2021-07-18 05:07:01 +0430 |
---|---|---|
committer | Ali Mohammad Pur <Ali.mpfard@gmail.com> | 2021-07-18 21:10:55 +0430 |
commit | f364fcec5da902ca8ae84dc0bca0f23533cebaa4 (patch) | |
tree | c71724ba7f1074671212cbb30adf3b5e6561e923 /Userland/Libraries/LibRegex | |
parent | e5af15a6e913bde254657990628e31c598a4a94f (diff) | |
download | serenity-f364fcec5da902ca8ae84dc0bca0f23533cebaa4.zip |
LibRegex+Everywhere: Make LibRegex more unicode-aware
This commit makes LibRegex (mostly) capable of operating on any of
the three main string views:
- StringView for raw strings
- Utf8View for UTF-8 encoded strings
- Utf32View for raw Unicode strings
As a result, regexes matched against Unicode strings should now handle
UTF-8 properly and no longer stop in the middle of a code point.
A future commit will update LibJS to use the correct type of string
depending on the flags.
Diffstat (limited to 'Userland/Libraries/LibRegex')
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.cpp | 155 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.h | 2 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexMatch.h | 301 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexMatcher.cpp | 42 |
4 files changed, 302 insertions, 198 deletions
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index bede259630..259aaf26af 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -288,30 +288,33 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, return ExecutionResult::Failed_ExecuteLowPrioForks; } -ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const { - if (input.match_index >= output.capture_group_matches.size()) { - output.capture_group_matches.ensure_capacity(input.match_index); - auto capacity = output.capture_group_matches.capacity(); - for (size_t i = output.capture_group_matches.size(); i <= capacity; ++i) - output.capture_group_matches.empend(); + if (input.match_index >= state.capture_group_matches.size()) { + state.capture_group_matches.ensure_capacity(input.match_index); + auto capacity = state.capture_group_matches.capacity(); + for (size_t i = state.capture_group_matches.size(); i <= capacity; ++i) + state.capture_group_matches.empend(); } - if (id() >= output.capture_group_matches.at(input.match_index).size()) { - output.capture_group_matches.at(input.match_index).ensure_capacity(id()); - auto capacity = output.capture_group_matches.at(input.match_index).capacity(); - for (size_t i = output.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i) - output.capture_group_matches.at(input.match_index).empend(); + if (id() >= state.capture_group_matches.at(input.match_index).size()) { + state.capture_group_matches.at(input.match_index).ensure_capacity(id()); + auto capacity = state.capture_group_matches.at(input.match_index).capacity(); + for (size_t i = state.capture_group_matches.at(input.match_index).size(); i <= 
capacity; ++i) + state.capture_group_matches.at(input.match_index).empend(); } - output.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position; + state.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position; return ExecutionResult::Continue; } -ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const { - auto& match = output.capture_group_matches.at(input.match_index).at(id()); + auto& match = state.capture_group_matches.at(input.match_index).at(id()); auto start_position = match.left_column; + if (state.string_position < start_position) + return ExecutionResult::Failed_ExecuteLowPrioForks; + auto length = state.string_position - start_position; if (start_position < match.column) @@ -330,27 +333,27 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchI return ExecutionResult::Continue; } -ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const { - if (input.match_index >= output.named_capture_group_matches.size()) { - output.named_capture_group_matches.ensure_capacity(input.match_index); - auto capacity = output.named_capture_group_matches.capacity(); - for (size_t i = output.named_capture_group_matches.size(); i <= capacity; ++i) - output.named_capture_group_matches.empend(); + if (input.match_index >= state.named_capture_group_matches.size()) { + state.named_capture_group_matches.ensure_capacity(input.match_index); + auto capacity = state.named_capture_group_matches.capacity(); + for (size_t i = state.named_capture_group_matches.size(); 
i <= capacity; ++i) + state.named_capture_group_matches.empend(); } - output.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position; + state.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position; return ExecutionResult::Continue; } -ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const { StringView capture_group_name = name(); - if (output.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) { - auto start_position = output.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column; + if (state.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) { + auto start_position = state.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column; auto length = state.string_position - start_position; - auto& map = output.named_capture_group_matches.at(input.match_index); + auto& map = state.named_capture_group_matches.at(input.match_index); if constexpr (REGEX_DEBUG) { VERIFY(start_position + length <= input.view.length()); @@ -371,7 +374,7 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const M return ExecutionResult::Continue; } -ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput&) const { bool inverse { false }; bool temporary_inverse { false }; @@ -414,14 +417,14 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M u32 ch = m_bytecode->at(offset++); // We want to compare a string that is longer or equal in length to the 
available string - if (input.view.length() - state.string_position < 1) + if (input.view.length() <= state.string_position) return ExecutionResult::Failed_ExecuteLowPrioForks; compare_char(input, state, ch, current_inversion_state(), inverse_matched); } else if (compare_type == CharacterCompareType::AnyChar) { // We want to compare a string that is definitely longer than the available string - if (input.view.length() - state.string_position < 1) + if (input.view.length() <= state.string_position) return ExecutionResult::Failed_ExecuteLowPrioForks; VERIFY(!current_inversion_state()); @@ -431,20 +434,25 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M VERIFY(!current_inversion_state()); const auto& length = m_bytecode->at(offset++); - StringBuilder str_builder; - for (size_t i = 0; i < length; ++i) - str_builder.append(m_bytecode->at(offset++)); // We want to compare a string that is definitely longer than the available string - if (input.view.length() - state.string_position < length) + if (input.view.length() < state.string_position + length) return ExecutionResult::Failed_ExecuteLowPrioForks; - if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length, had_zero_length_match)) + Optional<String> str; + Vector<u32> data; + data.ensure_capacity(length); + for (size_t i = offset; i < offset + length; ++i) + data.unchecked_append(m_bytecode->at(i)); + + auto view = input.view.construct_as_same(data, str); + offset += length; + if (!compare_string(input, state, view, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; } else if (compare_type == CharacterCompareType::CharClass) { - if (input.view.length() - state.string_position < 1) + if (input.view.length() <= state.string_position) return ExecutionResult::Failed_ExecuteLowPrioForks; auto character_class = (CharClass)m_bytecode->at(offset++); @@ -453,6 +461,9 @@ ALWAYS_INLINE ExecutionResult 
OpCode_Compare::execute(const MatchInput& input, M compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched); } else if (compare_type == CharacterCompareType::CharRange) { + if (input.view.length() <= state.string_position) + return ExecutionResult::Failed_ExecuteLowPrioForks; + auto value = (CharRange)m_bytecode->at(offset++); auto from = value.from; @@ -463,17 +474,17 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M } else if (compare_type == CharacterCompareType::Reference) { auto reference_number = (size_t)m_bytecode->at(offset++); - auto& groups = output.capture_group_matches.at(input.match_index); + auto& groups = state.capture_group_matches.at(input.match_index); if (groups.size() <= reference_number) return ExecutionResult::Failed_ExecuteLowPrioForks; auto str = groups.at(reference_number).view; // We want to compare a string that is definitely longer than the available string - if (input.view.length() - state.string_position < str.length()) + if (input.view.length() < state.string_position + str.length()) return ExecutionResult::Failed_ExecuteLowPrioForks; - if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match)) + if (!compare_string(input, state, str, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; } else if (compare_type == CharacterCompareType::NamedReference) { @@ -481,17 +492,17 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M auto length = (size_t)m_bytecode->at(offset++); StringView name { ptr, length }; - auto group = output.named_capture_group_matches.at(input.match_index).get(name); + auto group = state.named_capture_group_matches.at(input.match_index).get(name); if (!group.has_value()) return ExecutionResult::Failed_ExecuteLowPrioForks; auto str = group.value().view; // We want to compare a string that is definitely longer than the available 
string - if (input.view.length() - state.string_position < str.length()) + if (input.view.length() < state.string_position + str.length()) return ExecutionResult::Failed_ExecuteLowPrioForks; - if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match)) + if (!compare_string(input, state, str, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; } else { @@ -512,14 +523,19 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched) { - u32 ch2 = input.view[state.string_position]; + if (state.string_position == input.view.length()) + return; - if (input.regex_options & AllFlags::Insensitive) { - ch1 = to_ascii_lowercase(ch1); - ch2 = to_ascii_lowercase(ch2); - } + auto input_view = input.view.substring_view(state.string_position, 1); + Optional<String> str; + auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str); + bool equal; + if (input.regex_options & AllFlags::Insensitive) + equal = input_view.equals_ignoring_case(compare_view); + else + equal = input_view.equals(compare_view); - if (ch1 == ch2) { + if (equal) { if (inverse) inverse_matched = true; else @@ -527,41 +543,32 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchSt } } -ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match) +ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match) { - if (length == 0) { + if (state.string_position + str.length() > input.view.length()) { + if (str.is_empty()) { + had_zero_length_match = true; + return true; + } + return false; + } + + if (str.length() == 0) { had_zero_length_match = true; return true; } 
- if (input.view.is_u8_view()) { - auto str_view1 = StringView(str, length); - auto str_view2 = StringView(&input.view.u8view()[state.string_position], length); - - bool string_equals; - if (input.regex_options & AllFlags::Insensitive) - string_equals = str_view1.equals_ignoring_case(str_view2); - else - string_equals = str_view1 == str_view2; - - if (string_equals) { - state.string_position += length; - return true; - } - } else { - bool equals; - if (input.regex_options & AllFlags::Insensitive) - TODO(); - else - equals = __builtin_memcmp(str, &input.view.u32view().code_points()[state.string_position], length) == 0; + auto subject = input.view.substring_view(state.string_position, str.length()); + bool equals; + if (input.regex_options & AllFlags::Insensitive) + equals = subject.equals_ignoring_case(str); + else + equals = subject.equals(str); - if (equals) { - state.string_position += length; - return true; - } - } + if (equals) + state.string_position += str.length(); - return false; + return equals; } ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched) diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h index d53ab64462..ba1d3214a2 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.h +++ b/Userland/Libraries/LibRegex/RegexByteCode.h @@ -682,7 +682,7 @@ public: private: ALWAYS_INLINE static void compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched); - ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match); + ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match); ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, 
CharClass character_class, u32 ch, bool inverse, bool& inverse_matched); ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched); }; diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index 285b5e406f..39b22ab565 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -15,6 +15,8 @@ #include <AK/StringBuilder.h> #include <AK/StringView.h> #include <AK/Utf32View.h> +#include <AK/Utf8View.h> +#include <AK/Variant.h> #include <AK/Vector.h> namespace regex { @@ -22,124 +24,172 @@ namespace regex { class RegexStringView { public: RegexStringView(const char* chars) - : m_u8view(chars) + : m_view(StringView { chars }) { } RegexStringView(const String& string) - : m_u8view(string) + : m_view(string.view()) { } RegexStringView(const StringView view) - : m_u8view(view) + : m_view(view) { } - RegexStringView(const Utf32View view) - : m_u32view(view) + + RegexStringView(Utf32View view) + : m_view(view) + { + } + + RegexStringView(Utf8View view) + : m_view(view) { } - bool is_u8_view() const { return m_u8view.has_value(); } - bool is_u32_view() const { return m_u32view.has_value(); } + const StringView& string_view() const + { + return m_view.get<StringView>(); + } - const StringView& u8view() const + const Utf32View& u32_view() const { - VERIFY(m_u8view.has_value()); - return m_u8view.value(); - }; + return m_view.get<Utf32View>(); + } - const Utf32View& u32view() const + const Utf8View& u8_view() const { - VERIFY(m_u32view.has_value()); - return m_u32view.value(); - }; + return m_view.get<Utf8View>(); + } bool is_empty() const { - if (is_u8_view()) - return m_u8view.value().is_empty(); - else - return m_u32view.value().is_empty(); + return m_view.visit([](auto& view) { return view.is_empty(); }); } bool is_null() const { - if (is_u8_view()) - return m_u8view.value().is_null(); - else 
- return m_u32view.value().code_points() == nullptr; + return m_view.visit([](auto& view) { return view.is_null(); }); } size_t length() const { - if (is_u8_view()) - return m_u8view.value().length(); - else - return m_u32view.value().length(); + return m_view.visit([](auto& view) { return view.length(); }); + } + + RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage) const + { + return m_view.visit( + [&]<typename T>(T const&) { + StringBuilder builder; + for (auto ch : data) + builder.append(ch); // Note: The type conversion is intentional. + optional_string_storage = builder.build(); + return RegexStringView { T { *optional_string_storage } }; + }, + [&](Utf32View) { + return RegexStringView { Utf32View { data.data(), data.size() } }; + }); } Vector<RegexStringView> lines() const { - if (is_u8_view()) { - auto views = u8view().lines(false); - Vector<RegexStringView> new_views; - for (auto& view : views) - new_views.append(move(view)); - return new_views; - } - - Vector<RegexStringView> views; - auto view = u32view(); - u32 newline = '\n'; - while (!view.is_empty()) { - auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32)); - if (!position.has_value()) - break; - auto offset = position.value() / sizeof(u32); - views.append(view.substring_view(0, offset)); - view = view.substring_view(offset + 1, view.length() - offset - 1); - } - if (!view.is_empty()) - views.append(view); - return views; + return m_view.visit( + [](StringView view) { + auto views = view.lines(false); + Vector<RegexStringView> new_views; + for (auto& view : views) + new_views.empend(view); + return new_views; + }, + [](Utf32View view) { + Vector<RegexStringView> views; + u32 newline = '\n'; + while (!view.is_empty()) { + auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32)); + if (!position.has_value()) + break; + auto offset = position.value() / 
sizeof(u32); + views.empend(view.substring_view(0, offset)); + view = view.substring_view(offset + 1, view.length() - offset - 1); + } + if (!view.is_empty()) + views.empend(view); + return views; + }, + [](Utf8View& view) { + Vector<RegexStringView> views; + auto it = view.begin(); + auto previous_newline_position_it = it; + for (;;) { + if (*it == '\n') { + auto previous_offset = view.byte_offset_of(previous_newline_position_it); + auto new_offset = view.byte_offset_of(it); + auto slice = view.substring_view(previous_offset, new_offset - previous_offset); + views.empend(slice); + ++it; + previous_newline_position_it = it; + } + if (it.done()) + break; + ++it; + } + if (it != previous_newline_position_it) { + auto previous_offset = view.byte_offset_of(previous_newline_position_it); + auto new_offset = view.byte_offset_of(it); + auto slice = view.substring_view(previous_offset, new_offset - previous_offset); + views.empend(slice); + } + return views; + }); } RegexStringView substring_view(size_t offset, size_t length) const { - if (is_u8_view()) { - return u8view().substring_view(offset, length); - } - return u32view().substring_view(offset, length); + return m_view.visit( + [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }, + [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }); } String to_string() const { - if (is_u8_view()) { - return u8view().to_string(); - } - - StringBuilder builder; - builder.append(u32view()); - return builder.to_string(); + return m_view.visit( + [](StringView view) { return view.to_string(); }, + [](auto& view) { + StringBuilder builder; + for (auto it = view.begin(); it != view.end(); ++it) + builder.append_code_point(*it); + return builder.to_string(); + }); } u32 operator[](size_t index) const { - if (is_u8_view()) { - i8 ch = u8view()[index]; - u8 value = *reinterpret_cast<u8*>(&ch); - return static_cast<u32>(value); - } - return 
u32view().code_points()[index]; + return m_view.visit( + [&](StringView view) -> u32 { + auto ch = view[index]; + if (ch < 0) + return 256u + ch; + return ch; + }, + [&](auto view) -> u32 { return view[index]; }, + [&](Utf8View& view) -> u32 { + size_t i = index; + for (auto it = view.begin(); it != view.end(); ++it, --i) { + if (i == 0) + return *it; + } + VERIFY_NOT_REACHED(); + }); } bool operator==(const char* cstring) const { - if (is_u8_view()) - return u8view() == cstring; - - return to_string() == cstring; + return m_view.visit( + [&](Utf32View) { return to_string() == cstring; }, + [&](Utf8View const& view) { return view.as_string() == cstring; }, + [&](StringView view) { return view == cstring; }); } bool operator!=(const char* cstring) const @@ -149,18 +199,18 @@ public: bool operator==(const String& string) const { - if (is_u8_view()) - return u8view() == string; - - return to_string() == string; + return m_view.visit( + [&](Utf32View) { return to_string() == string; }, + [&](Utf8View const& view) { return view.as_string() == string; }, + [&](StringView view) { return view == string; }); } - bool operator==(const StringView& other) const + bool operator==(const StringView& string) const { - if (is_u8_view()) - return u8view() == other; - - return false; + return m_view.visit( + [&](Utf32View) { return to_string() == string; }, + [&](Utf8View const& view) { return view.as_string() == string; }, + [&](StringView view) { return view == string; }); } bool operator!=(const StringView& other) const @@ -170,13 +220,12 @@ public: bool operator==(const Utf32View& other) const { - if (is_u32_view()) { - StringBuilder builder; - builder.append(other); - return to_string() == builder.to_string(); - } - - return false; + return m_view.visit( + [&](Utf32View view) { + return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0; + }, + [&](Utf8View const& view) { return view.as_string() == 
RegexStringView { other }.to_string(); }, + [&](StringView view) { return view == RegexStringView { other }.to_string(); }); } bool operator!=(const Utf32View& other) const @@ -184,34 +233,78 @@ public: return !(*this == other); } - const char* characters_without_null_termination() const + bool operator==(const Utf8View& other) const + { + return m_view.visit( + [&](Utf32View) { + return to_string() == other.as_string(); + }, + [&](Utf8View const& view) { return view.as_string() == other.as_string(); }, + [&](StringView view) { return other.as_string() == view; }); + } + + bool operator!=(const Utf8View& other) const + { + return !(*this == other); + } + + bool equals(const RegexStringView& other) const { - if (is_u8_view()) - return u8view().characters_without_null_termination(); + return other.m_view.visit([&](auto const& view) { return operator==(view); }); + } - return to_string().characters(); // FIXME: it contains the null termination, does that actually matter? + bool equals_ignoring_case(const RegexStringView& other) const + { + // FIXME: Implement equals_ignoring_case() for unicode. 
+ return m_view.visit( + [&](StringView view) { + return other.m_view.visit( + [&](StringView other_view) { return view.equals_ignoring_case(other_view); }, + [](auto&) -> bool { TODO(); }); + }, + [](auto&) -> bool { TODO(); }); } bool starts_with(const StringView& str) const { - if (is_u32_view()) - return false; - return u8view().starts_with(str); + return m_view.visit( + [&](Utf32View) -> bool { + TODO(); + }, + [&](Utf8View const& view) { return view.as_string().starts_with(str); }, + [&](StringView view) { return view.starts_with(str); }); } bool starts_with(const Utf32View& str) const { - if (is_u8_view()) - return false; - - StringBuilder builder; - builder.append(str); - return to_string().starts_with(builder.to_string()); + return m_view.visit( + [&](Utf32View view) -> bool { + if (str.length() > view.length()) + return false; + if (str.length() == view.length()) + return operator==(str); + for (size_t i = 0; i < str.length(); ++i) { + if (str.at(i) != view.at(i)) + return false; + } + return true; + }, + [&](Utf8View const& view) { + auto it = view.begin(); + for (auto code_point : str) { + if (it.done()) + return false; + if (code_point != *it) + return false; + ++it; + } + return true; + }, + [&](StringView) -> bool { TODO(); }); } private: - Optional<StringView> m_u8view; - Optional<Utf32View> m_u32view; + Variant<StringView, Utf8View, Utf32View> m_view; }; class Match final { @@ -271,6 +364,9 @@ struct MatchState { size_t string_position { 0 }; size_t instruction_position { 0 }; size_t fork_at_position { 0 }; + Vector<Match> matches; + Vector<Vector<Match>> capture_group_matches; + Vector<HashMap<String, Match>> named_capture_group_matches; }; struct MatchOutput { @@ -288,6 +384,7 @@ template<> struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> { void format(FormatBuilder& builder, const regex::RegexStringView& value) { - return Formatter<StringView>::format(builder, { value.characters_without_null_termination(), value.length() }); 
+ auto string = value.to_string(); + return Formatter<StringView>::format(builder, string); } }; diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index d08b636f53..fff1487c73 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -99,34 +99,34 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional } if (c_match_preallocation_count) { - output.matches.ensure_capacity(c_match_preallocation_count); - output.capture_group_matches.ensure_capacity(c_match_preallocation_count); - output.named_capture_group_matches.ensure_capacity(c_match_preallocation_count); + state.matches.ensure_capacity(c_match_preallocation_count); + state.capture_group_matches.ensure_capacity(c_match_preallocation_count); + state.named_capture_group_matches.ensure_capacity(c_match_preallocation_count); auto& capture_groups_count = m_pattern.parser_result.capture_groups_count; auto& named_capture_groups_count = m_pattern.parser_result.named_capture_groups_count; for (size_t j = 0; j < c_match_preallocation_count; ++j) { - output.matches.empend(); - output.capture_group_matches.unchecked_append({}); - output.capture_group_matches.at(j).ensure_capacity(capture_groups_count); + state.matches.empend(); + state.capture_group_matches.unchecked_append({}); + state.capture_group_matches.at(j).ensure_capacity(capture_groups_count); for (size_t k = 0; k < capture_groups_count; ++k) - output.capture_group_matches.at(j).unchecked_append({}); + state.capture_group_matches.at(j).unchecked_append({}); - output.named_capture_group_matches.unchecked_append({}); - output.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count); + state.named_capture_group_matches.unchecked_append({}); + state.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count); } } - auto append_match = [](auto& input, auto& state, auto& output, auto& 
start_position) { - if (output.matches.size() == input.match_index) - output.matches.empend(); + auto append_match = [](auto& input, auto& state, auto& start_position) { + if (state.matches.size() == input.match_index) + state.matches.empend(); VERIFY(start_position + state.string_position - start_position <= input.view.length()); if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) { - output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position }; + state.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position }; } else { // let the view point to the original string ... - output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position }; + state.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position }; } }; @@ -171,7 +171,7 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional output = move(temp_output); if (!match_count) { // Nothing was *actually* matched, so append an empty match. - append_match(input, state, output, view_index); + append_match(input, state, view_index); ++match_count; } } @@ -219,21 +219,21 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional ++match_count; if (continue_search) { - append_match(input, state, output, view_index); + append_match(input, state, view_index); bool has_zero_length = state.string_position == view_index; view_index = state.string_position - (has_zero_length ? 
0 : 1); continue; } else if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) { - append_match(input, state, output, view_index); + append_match(input, state, view_index); break; } else if (state.string_position < view_length) { return { false, 0, {}, {}, {}, output.operations }; } - append_match(input, state, output, view_index); + append_match(input, state, view_index); break; } @@ -253,7 +253,7 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional MatchOutput output_copy; if (match_count) { - output_copy.capture_group_matches = output.capture_group_matches; + output_copy.capture_group_matches = state.capture_group_matches; // Make sure there are as many capture matches as there are actual matches. if (output_copy.capture_group_matches.size() < match_count) output_copy.capture_group_matches.resize(match_count); @@ -264,12 +264,12 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional matches.template remove_all_matching([](auto& match) { return match.view.is_null(); }); } - output_copy.named_capture_group_matches = output.named_capture_group_matches; + output_copy.named_capture_group_matches = state.named_capture_group_matches; // Make sure there are as many capture matches as there are actual matches. if (output_copy.named_capture_group_matches.size() < match_count) output_copy.named_capture_group_matches.resize(match_count); - output_copy.matches = output.matches; + output_copy.matches = state.matches; } else { output_copy.capture_group_matches.clear_with_capacity(); output_copy.named_capture_group_matches.clear_with_capacity(); |