/* * Copyright (c) 2020, Emanuel Sprung * * SPDX-License-Identifier: BSD-2-Clause */ #pragma once #include "RegexOptions.h" #include "AK/FlyString.h" #include "AK/HashMap.h" #include "AK/String.h" #include "AK/StringBuilder.h" #include "AK/StringView.h" #include "AK/Utf32View.h" #include "AK/Vector.h" namespace regex { class RegexStringView { public: RegexStringView(const char* chars) : m_u8view(chars) { } RegexStringView(const String& string) : m_u8view(string) { } RegexStringView(const StringView view) : m_u8view(view) { } RegexStringView(const Utf32View view) : m_u32view(view) { } bool is_u8_view() const { return m_u8view.has_value(); } bool is_u32_view() const { return m_u32view.has_value(); } const StringView& u8view() const { VERIFY(m_u8view.has_value()); return m_u8view.value(); }; const Utf32View& u32view() const { VERIFY(m_u32view.has_value()); return m_u32view.value(); }; bool is_empty() const { if (is_u8_view()) return m_u8view.value().is_empty(); else return m_u32view.value().is_empty(); } bool is_null() const { if (is_u8_view()) return m_u8view.value().is_null(); else return m_u32view.value().code_points() == nullptr; } size_t length() const { if (is_u8_view()) return m_u8view.value().length(); else return m_u32view.value().length(); } Vector lines() const { if (is_u8_view()) { auto views = u8view().lines(false); Vector new_views; for (auto& view : views) new_views.append(move(view)); return new_views; } // FIXME: line splitting for Utf32View needed Vector views; views.append(m_u32view.value()); return views; } RegexStringView substring_view(size_t offset, size_t length) const { if (is_u8_view()) { return u8view().substring_view(offset, length); } return u32view().substring_view(offset, length); } String to_string() const { if (is_u8_view()) { return u8view().to_string(); } StringBuilder builder; builder.append(u32view()); return builder.to_string(); } u32 operator[](size_t index) const { if (is_u8_view()) { i8 ch = u8view()[index]; u8 value = *reinterpret_cast(&ch); return static_cast(value); } return u32view().code_points()[index]; } bool operator==(const char* cstring) const { if (is_u8_view()) return u8view() == cstring; return to_string() == cstring; } bool operator!=(const char* cstring) const { return !(*this == cstring); } bool operator==(const String& string) const { if (is_u8_view()) return u8view() == string; return to_string() == string; } bool operator==(const StringView& other) const { if (is_u8_view()) return u8view() == other; return false; } bool operator!=(const StringView& other) const { return !(*this == other); } bool operator==(const Utf32View& other) const { if (is_u32_view()) { StringBuilder builder; builder.append(other); return to_string() == builder.to_string(); } return false; } bool operator!=(const Utf32View& other) const { return !(*this == other); } const char* characters_without_null_termination() const { if (is_u8_view()) return u8view().characters_without_null_termination(); return to_string().characters(); // FIXME: it contains the null termination, does that actually matter? } bool starts_with(const StringView& str) const { if (is_u32_view()) return false; return u8view().starts_with(str); } bool starts_with(const Utf32View& str) const { if (is_u8_view()) return false; StringBuilder builder; builder.append(str); return to_string().starts_with(builder.to_string()); } private: Optional m_u8view; Optional m_u32view; }; class Match final { private: Optional string; public: Match() = default; ~Match() = default; Match(const RegexStringView view_, const size_t line_, const size_t column_, const size_t global_offset_) : view(view_) , line(line_) , column(column_) , global_offset(global_offset_) , left_column(column_) { } Match(const String string_, const size_t line_, const size_t column_, const size_t global_offset_) : string(string_) , view(string.value().view()) , line(line_) , column(column_) , global_offset(global_offset_) , left_column(column_) { } RegexStringView view { nullptr }; size_t line { 0 }; size_t column { 0 }; size_t global_offset { 0 }; // ugly, as not usable by user, but needed to prevent to create extra vectors that are // able to store the column when the left paren has been found size_t left_column { 0 }; }; struct MatchInput { RegexStringView view { nullptr }; AllOptions regex_options {}; size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset. size_t match_index { 0 }; size_t line { 0 }; size_t column { 0 }; size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important mutable size_t fail_counter { 0 }; mutable Vector saved_positions; }; struct MatchState { size_t string_position_before_match { 0 }; size_t string_position { 0 }; size_t instruction_position { 0 }; size_t fork_at_position { 0 }; }; struct MatchOutput { size_t operations; Vector matches; Vector> capture_group_matches; Vector> named_capture_group_matches; }; } using regex::RegexStringView; template<> struct AK::Formatter : Formatter { void format(FormatBuilder& builder, const regex::RegexStringView& value) { return Formatter::format(builder, { value.characters_without_null_termination(), value.length() }); } };