/* * Copyright (c) 2020, Emanuel Sprung * * SPDX-License-Identifier: BSD-2-Clause */ #pragma once #include "Forward.h" #include "RegexOptions.h" #include #include #include #include #include #include #include #include #include #include #include namespace regex { class RegexStringView { public: RegexStringView(char const* chars) : m_view(StringView { chars }) { } RegexStringView(String const& string) : m_view(string.view()) { } RegexStringView(StringView const view) : m_view(view) { } RegexStringView(Utf32View view) : m_view(view) { } RegexStringView(Utf16View view) : m_view(view) { } RegexStringView(Utf8View view) : m_view(view) { } explicit RegexStringView(String&&) = delete; StringView string_view() const { return m_view.get(); } Utf32View const& u32_view() const { return m_view.get(); } Utf16View const& u16_view() const { return m_view.get(); } Utf8View const& u8_view() const { return m_view.get(); } bool unicode() const { return m_unicode; } void set_unicode(bool unicode) { m_unicode = unicode; } bool is_empty() const { return m_view.visit([](auto& view) { return view.is_empty(); }); } bool is_null() const { return m_view.visit([](auto& view) { return view.is_null(); }); } size_t length() const { if (unicode()) { return m_view.visit( [](Utf16View const& view) { return view.length_in_code_points(); }, [](auto const& view) { return view.length(); }); } return length_in_code_units(); } size_t length_in_code_units() const { return m_view.visit( [](Utf16View const& view) { return view.length_in_code_units(); }, [](Utf8View const& view) { return view.byte_length(); }, [](auto const& view) { return view.length(); }); } size_t length_of_code_point(u32 code_point) const { return m_view.visit( [](Utf32View const&) { return 1; }, [&](Utf16View const&) { if (code_point < 0x10000) return 1; return 2; }, [&](auto const&) { if (code_point <= 0x7f) return 1; if (code_point <= 0x07ff) return 2; if (code_point <= 0xffff) return 3; return 4; }); } RegexStringView typed_null_view() { auto view = m_view.visit( [&](T const&) { return RegexStringView { T {} }; }); view.set_unicode(unicode()); return view; } RegexStringView construct_as_same(Span data, Optional& optional_string_storage, Vector& optional_utf16_storage) const { auto view = m_view.visit( [&](T const&) { StringBuilder builder; for (auto ch : data) builder.append(ch); // Note: The type conversion is intentional. optional_string_storage = builder.build(); return RegexStringView { T { *optional_string_storage } }; }, [&](Utf32View) { return RegexStringView { Utf32View { data.data(), data.size() } }; }, [&](Utf16View) { optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() }); return RegexStringView { Utf16View { optional_utf16_storage } }; }); view.set_unicode(unicode()); return view; } Vector lines() const { return m_view.visit( [](StringView view) { auto views = view.lines(false); Vector new_views; for (auto& view : views) new_views.empend(view); return new_views; }, [](Utf32View view) { if (view.is_empty()) return Vector { view }; Vector views; u32 newline = '\n'; while (!view.is_empty()) { auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32)); if (!position.has_value()) break; auto offset = position.value() / sizeof(u32); views.empend(view.substring_view(0, offset)); view = view.substring_view(offset + 1, view.length() - offset - 1); } if (!view.is_empty()) views.empend(view); return views; }, [](Utf16View view) { if (view.is_empty()) return Vector { view }; Vector views; u16 newline = '\n'; while (!view.is_empty()) { auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16)); if (!position.has_value()) break; auto offset = position.value() / sizeof(u16); views.empend(view.substring_view(0, offset)); view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1); } if (!view.is_empty()) views.empend(view); return views; }, [](Utf8View const& view) { if (view.is_empty()) return Vector { view }; Vector views; auto it = view.begin(); auto previous_newline_position_it = it; for (;;) { if (*it == '\n') { auto previous_offset = view.byte_offset_of(previous_newline_position_it); auto new_offset = view.byte_offset_of(it); auto slice = view.substring_view(previous_offset, new_offset - previous_offset); views.empend(slice); ++it; previous_newline_position_it = it; } if (it.done()) break; ++it; } if (it != previous_newline_position_it) { auto previous_offset = view.byte_offset_of(previous_newline_position_it); auto new_offset = view.byte_offset_of(it); auto slice = view.substring_view(previous_offset, new_offset - previous_offset); views.empend(slice); } return views; }); } RegexStringView substring_view(size_t offset, size_t length) const { if (unicode()) { auto view = m_view.visit( [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }, [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }, [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }); view.set_unicode(unicode()); return view; } auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }); view.set_unicode(unicode()); return view; } String to_string() const { return m_view.visit( [](StringView view) { return view.to_string(); }, [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); }, [](auto& view) { StringBuilder builder; for (auto it = view.begin(); it != view.end(); ++it) builder.append_code_point(*it); return builder.to_string(); }); } // Note: index must always be the code unit offset to return. u32 operator[](size_t index) const { return m_view.visit( [&](StringView view) -> u32 { auto ch = view[index]; if (ch < 0) return 256u + ch; return ch; }, [&](Utf32View const& view) -> u32 { return view[index]; }, [&](Utf16View const& view) -> u32 { return view.code_point_at(index); }, [&](Utf8View const& view) -> u32 { auto it = view.iterator_at_byte_offset(index); VERIFY(it != view.end()); return *it; }); } size_t code_unit_offset_of(size_t code_point_index) const { return m_view.visit( [&](StringView view) -> u32 { Utf8View utf8_view { view }; return utf8_view.byte_offset_of(code_point_index); }, [&](Utf32View const&) -> u32 { return code_point_index; }, [&](Utf16View const& view) -> u32 { return view.code_unit_offset_of(code_point_index); }, [&](Utf8View const& view) -> u32 { return view.byte_offset_of(code_point_index); }); } bool operator==(char const* cstring) const { return m_view.visit( [&](Utf32View) { return to_string() == cstring; }, [&](Utf16View) { return to_string() == cstring; }, [&](Utf8View const& view) { return view.as_string() == cstring; }, [&](StringView view) { return view == cstring; }); } bool operator!=(char const* cstring) const { return !(*this == cstring); } bool operator==(String const& string) const { return m_view.visit( [&](Utf32View) { return to_string() == string; }, [&](Utf16View) { return to_string() == string; }, [&](Utf8View const& view) { return view.as_string() == string; }, [&](StringView view) { return view == string; }); } bool operator==(StringView string) const { return m_view.visit( [&](Utf32View) { return to_string() == string; }, [&](Utf16View) { return to_string() == string; }, [&](Utf8View const& view) { return view.as_string() == string; }, [&](StringView view) { return view == string; }); } bool operator!=(StringView other) const { return !(*this == other); } bool operator==(Utf32View const& other) const { return m_view.visit( [&](Utf32View view) { return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0; }, [&](Utf16View) { return to_string() == RegexStringView { other }.to_string(); }, [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); }, [&](StringView view) { return view == RegexStringView { other }.to_string(); }); } bool operator!=(Utf32View const& other) const { return !(*this == other); } bool operator==(Utf16View const& other) const { return m_view.visit( [&](Utf32View) { return to_string() == RegexStringView { other }.to_string(); }, [&](Utf16View const& view) { return view == other; }, [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); }, [&](StringView view) { return view == RegexStringView { other }.to_string(); }); } bool operator!=(Utf16View const& other) const { return !(*this == other); } bool operator==(Utf8View const& other) const { return m_view.visit( [&](Utf32View) { return to_string() == other.as_string(); }, [&](Utf16View) { return to_string() == other.as_string(); }, [&](Utf8View const& view) { return view.as_string() == other.as_string(); }, [&](StringView view) { return other.as_string() == view; }); } bool operator!=(Utf8View const& other) const { return !(*this == other); } bool equals(RegexStringView other) const { return other.m_view.visit([this](auto const& view) { return operator==(view); }); } bool equals_ignoring_case(RegexStringView other) const { // FIXME: Implement equals_ignoring_case() for unicode. return m_view.visit( [&](StringView view) { return other.m_view.visit( [&](StringView other_view) { return view.equals_ignoring_case(other_view); }, [](auto&) -> bool { TODO(); }); }, [&](Utf16View view) { return other.m_view.visit( [&](Utf16View other_view) { return view.equals_ignoring_case(other_view); }, [](auto&) -> bool { TODO(); }); }, [](auto&) -> bool { TODO(); }); } bool starts_with(StringView str) const { return m_view.visit( [&](Utf32View) -> bool { TODO(); }, [&](Utf16View) -> bool { TODO(); }, [&](Utf8View const& view) { return view.as_string().starts_with(str); }, [&](StringView view) { return view.starts_with(str); }); } bool starts_with(Utf32View const& str) const { return m_view.visit( [&](Utf32View view) -> bool { if (str.length() > view.length()) return false; if (str.length() == view.length()) return operator==(str); for (size_t i = 0; i < str.length(); ++i) { if (str.at(i) != view.at(i)) return false; } return true; }, [&](Utf16View) -> bool { TODO(); }, [&](Utf8View const& view) { auto it = view.begin(); for (auto code_point : str) { if (it.done()) return false; if (code_point != *it) return false; ++it; } return true; }, [&](StringView) -> bool { TODO(); }); } private: Variant m_view; bool m_unicode { false }; }; class Match final { private: Optional string; public: Match() = default; ~Match() = default; Match(RegexStringView view_, size_t const line_, size_t const column_, size_t const global_offset_) : view(view_) , line(line_) , column(column_) , global_offset(global_offset_) , left_column(column_) { } Match(String string_, size_t const line_, size_t const column_, size_t const global_offset_) : string(move(string_)) , view(string.value().view()) , line(line_) , column(column_) , global_offset(global_offset_) { } Match(RegexStringView const view_, StringView capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_) : view(view_) , capture_group_name(capture_group_name_) , line(line_) , column(column_) , global_offset(global_offset_) , left_column(column_) { } void reset() { view = view.typed_null_view(); capture_group_name.clear(); line = 0; column = 0; global_offset = 0; left_column = 0; } RegexStringView view { nullptr }; Optional capture_group_name {}; size_t line { 0 }; size_t column { 0 }; size_t global_offset { 0 }; // ugly, as not usable by user, but needed to prevent to create extra vectors that are // able to store the column when the left paren has been found size_t left_column { 0 }; }; struct MatchInput { RegexStringView view { nullptr }; AllOptions regex_options {}; size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset. size_t match_index { 0 }; size_t line { 0 }; size_t column { 0 }; size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important mutable size_t fail_counter { 0 }; mutable Vector saved_positions; mutable Vector saved_code_unit_positions; mutable Vector saved_forks_since_last_save; mutable HashMap checkpoints; mutable Optional fork_to_replace; }; struct MatchState { size_t string_position_before_match { 0 }; size_t string_position { 0 }; size_t string_position_in_code_units { 0 }; size_t instruction_position { 0 }; size_t fork_at_position { 0 }; size_t forks_since_last_save { 0 }; Optional initiating_fork; Vector matches; Vector> capture_group_matches; Vector repetition_marks; }; } using regex::RegexStringView; template<> struct AK::Formatter : Formatter { ErrorOr format(FormatBuilder& builder, regex::RegexStringView value) { auto string = value.to_string(); return Formatter::format(builder, string); } };