diff options
Diffstat (limited to 'Userland/Libraries/LibRegex/RegexMatch.h')
-rw-r--r-- | Userland/Libraries/LibRegex/RegexMatch.h | 301 |
1 files changed, 199 insertions, 102 deletions
diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index 285b5e406f..39b22ab565 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -15,6 +15,8 @@ #include <AK/StringBuilder.h> #include <AK/StringView.h> #include <AK/Utf32View.h> +#include <AK/Utf8View.h> +#include <AK/Variant.h> #include <AK/Vector.h> namespace regex { @@ -22,124 +24,172 @@ namespace regex { class RegexStringView { public: RegexStringView(const char* chars) - : m_u8view(chars) + : m_view(StringView { chars }) { } RegexStringView(const String& string) - : m_u8view(string) + : m_view(string.view()) { } RegexStringView(const StringView view) - : m_u8view(view) + : m_view(view) { } - RegexStringView(const Utf32View view) - : m_u32view(view) + + RegexStringView(Utf32View view) + : m_view(view) + { + } + + RegexStringView(Utf8View view) + : m_view(view) { } - bool is_u8_view() const { return m_u8view.has_value(); } - bool is_u32_view() const { return m_u32view.has_value(); } + const StringView& string_view() const + { + return m_view.get<StringView>(); + } - const StringView& u8view() const + const Utf32View& u32_view() const { - VERIFY(m_u8view.has_value()); - return m_u8view.value(); - }; + return m_view.get<Utf32View>(); + } - const Utf32View& u32view() const + const Utf8View& u8_view() const { - VERIFY(m_u32view.has_value()); - return m_u32view.value(); - }; + return m_view.get<Utf8View>(); + } bool is_empty() const { - if (is_u8_view()) - return m_u8view.value().is_empty(); - else - return m_u32view.value().is_empty(); + return m_view.visit([](auto& view) { return view.is_empty(); }); } bool is_null() const { - if (is_u8_view()) - return m_u8view.value().is_null(); - else - return m_u32view.value().code_points() == nullptr; + return m_view.visit([](auto& view) { return view.is_null(); }); } size_t length() const { - if (is_u8_view()) - return m_u8view.value().length(); - else - return m_u32view.value().length(); + return m_view.visit([](auto& view) { return view.length(); }); + } + + RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage) const + { + return m_view.visit( + [&]<typename T>(T const&) { + StringBuilder builder; + for (auto ch : data) + builder.append(ch); // Note: The type conversion is intentional. + optional_string_storage = builder.build(); + return RegexStringView { T { *optional_string_storage } }; + }, + [&](Utf32View) { + return RegexStringView { Utf32View { data.data(), data.size() } }; + }); } Vector<RegexStringView> lines() const { - if (is_u8_view()) { - auto views = u8view().lines(false); - Vector<RegexStringView> new_views; - for (auto& view : views) - new_views.append(move(view)); - return new_views; - } - - Vector<RegexStringView> views; - auto view = u32view(); - u32 newline = '\n'; - while (!view.is_empty()) { - auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32)); - if (!position.has_value()) - break; - auto offset = position.value() / sizeof(u32); - views.append(view.substring_view(0, offset)); - view = view.substring_view(offset + 1, view.length() - offset - 1); - } - if (!view.is_empty()) - views.append(view); - return views; + return m_view.visit( + [](StringView view) { + auto views = view.lines(false); + Vector<RegexStringView> new_views; + for (auto& view : views) + new_views.empend(view); + return new_views; + }, + [](Utf32View view) { + Vector<RegexStringView> views; + u32 newline = '\n'; + while (!view.is_empty()) { + auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32)); + if (!position.has_value()) + break; + auto offset = position.value() / sizeof(u32); + views.empend(view.substring_view(0, offset)); + view = view.substring_view(offset + 1, view.length() - offset - 1); + } + if (!view.is_empty()) + views.empend(view); + return views; + }, + [](Utf8View& view) { + Vector<RegexStringView> views; + auto it = view.begin(); + auto previous_newline_position_it = it; + for (;;) { + if (*it == '\n') { + auto previous_offset = view.byte_offset_of(previous_newline_position_it); + auto new_offset = view.byte_offset_of(it); + auto slice = view.substring_view(previous_offset, new_offset - previous_offset); + views.empend(slice); + ++it; + previous_newline_position_it = it; + } + if (it.done()) + break; + ++it; + } + if (it != previous_newline_position_it) { + auto previous_offset = view.byte_offset_of(previous_newline_position_it); + auto new_offset = view.byte_offset_of(it); + auto slice = view.substring_view(previous_offset, new_offset - previous_offset); + views.empend(slice); + } + return views; + }); } RegexStringView substring_view(size_t offset, size_t length) const { - if (is_u8_view()) { - return u8view().substring_view(offset, length); - } - return u32view().substring_view(offset, length); + return m_view.visit( + [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }, + [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }); } String to_string() const { - if (is_u8_view()) { - return u8view().to_string(); - } - - StringBuilder builder; - builder.append(u32view()); - return builder.to_string(); + return m_view.visit( + [](StringView view) { return view.to_string(); }, + [](auto& view) { + StringBuilder builder; + for (auto it = view.begin(); it != view.end(); ++it) + builder.append_code_point(*it); + return builder.to_string(); + }); } u32 operator[](size_t index) const { - if (is_u8_view()) { - i8 ch = u8view()[index]; - u8 value = *reinterpret_cast<u8*>(&ch); - return static_cast<u32>(value); - } - return u32view().code_points()[index]; + return m_view.visit( + [&](StringView view) -> u32 { + auto ch = view[index]; + if (ch < 0) + return 256u + ch; + return ch; + }, + [&](auto view) -> u32 { return view[index]; }, + [&](Utf8View& view) -> u32 { + size_t i = index; + for (auto it = view.begin(); it != view.end(); ++it, --i) { + if (i == 0) + return *it; + } + VERIFY_NOT_REACHED(); + }); } bool operator==(const char* cstring) const { - if (is_u8_view()) - return u8view() == cstring; - - return to_string() == cstring; + return m_view.visit( + [&](Utf32View) { return to_string() == cstring; }, + [&](Utf8View const& view) { return view.as_string() == cstring; }, + [&](StringView view) { return view == cstring; }); } bool operator!=(const char* cstring) const @@ -149,18 +199,18 @@ public: bool operator==(const String& string) const { - if (is_u8_view()) - return u8view() == string; - - return to_string() == string; + return m_view.visit( + [&](Utf32View) { return to_string() == string; }, + [&](Utf8View const& view) { return view.as_string() == string; }, + [&](StringView view) { return view == string; }); } - bool operator==(const StringView& other) const + bool operator==(const StringView& string) const { - if (is_u8_view()) - return u8view() == other; - - return false; + return m_view.visit( + [&](Utf32View) { return to_string() == string; }, + [&](Utf8View const& view) { return view.as_string() == string; }, + [&](StringView view) { return view == string; }); } bool operator!=(const StringView& other) const @@ -170,13 +220,12 @@ public: bool operator==(const Utf32View& other) const { - if (is_u32_view()) { - StringBuilder builder; - builder.append(other); - return to_string() == builder.to_string(); - } - - return false; + return m_view.visit( + [&](Utf32View view) { + return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0; + }, + [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); }, + [&](StringView view) { return view == RegexStringView { other }.to_string(); }); } bool operator!=(const Utf32View& other) const @@ -184,34 +233,78 @@ public: return !(*this == other); } - const char* characters_without_null_termination() const + bool operator==(const Utf8View& other) const + { + return m_view.visit( + [&](Utf32View) { + return to_string() == other.as_string(); + }, + [&](Utf8View const& view) { return view.as_string() == other.as_string(); }, + [&](StringView view) { return other.as_string() == view; }); + } + + bool operator!=(const Utf8View& other) const + { + return !(*this == other); + } + + bool equals(const RegexStringView& other) const { - if (is_u8_view()) - return u8view().characters_without_null_termination(); + return other.m_view.visit([&](auto const& view) { return operator==(view); }); + } - return to_string().characters(); // FIXME: it contains the null termination, does that actually matter? + bool equals_ignoring_case(const RegexStringView& other) const + { + // FIXME: Implement equals_ignoring_case() for unicode. + return m_view.visit( + [&](StringView view) { + return other.m_view.visit( + [&](StringView other_view) { return view.equals_ignoring_case(other_view); }, + [](auto&) -> bool { TODO(); }); + }, + [](auto&) -> bool { TODO(); }); } bool starts_with(const StringView& str) const { - if (is_u32_view()) - return false; - return u8view().starts_with(str); + return m_view.visit( + [&](Utf32View) -> bool { + TODO(); + }, + [&](Utf8View const& view) { return view.as_string().starts_with(str); }, + [&](StringView view) { return view.starts_with(str); }); } bool starts_with(const Utf32View& str) const { - if (is_u8_view()) - return false; - - StringBuilder builder; - builder.append(str); - return to_string().starts_with(builder.to_string()); + return m_view.visit( + [&](Utf32View view) -> bool { + if (str.length() > view.length()) + return false; + if (str.length() == view.length()) + return operator==(str); + for (size_t i = 0; i < str.length(); ++i) { + if (str.at(i) != view.at(i)) + return false; + } + return true; + }, + [&](Utf8View const& view) { + auto it = view.begin(); + for (auto code_point : str) { + if (it.done()) + return false; + if (code_point != *it) + return false; + ++it; + } + return true; + }, + [&](StringView) -> bool { TODO(); }); } private: - Optional<StringView> m_u8view; - Optional<Utf32View> m_u32view; + Variant<StringView, Utf8View, Utf32View> m_view; }; class Match final { @@ -271,6 +364,9 @@ struct MatchState { size_t string_position { 0 }; size_t instruction_position { 0 }; size_t fork_at_position { 0 }; + Vector<Match> matches; + Vector<Vector<Match>> capture_group_matches; + Vector<HashMap<String, Match>> named_capture_group_matches; }; struct MatchOutput { @@ -288,6 +384,7 @@ template<> struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> { void format(FormatBuilder& builder, const regex::RegexStringView& value) { - return Formatter<StringView>::format(builder, { value.characters_without_null_termination(), value.length() }); + auto string = value.to_string(); + return Formatter<StringView>::format(builder, string); } }; |