summary | refs | log | tree | commit | diff
path: root/Userland/Libraries/LibRegex/RegexMatch.h
diff options
context:
space:
mode:
Diffstat (limited to 'Userland/Libraries/LibRegex/RegexMatch.h')
-rw-r--r-- Userland/Libraries/LibRegex/RegexMatch.h 301
1 files changed, 199 insertions, 102 deletions
diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h
index 285b5e406f..39b22ab565 100644
--- a/Userland/Libraries/LibRegex/RegexMatch.h
+++ b/Userland/Libraries/LibRegex/RegexMatch.h
@@ -15,6 +15,8 @@
#include <AK/StringBuilder.h>
#include <AK/StringView.h>
#include <AK/Utf32View.h>
+#include <AK/Utf8View.h>
+#include <AK/Variant.h>
#include <AK/Vector.h>
namespace regex {
@@ -22,124 +24,172 @@ namespace regex {
class RegexStringView {
public:
RegexStringView(const char* chars)
- : m_u8view(chars)
+ : m_view(StringView { chars })
{
}
RegexStringView(const String& string)
- : m_u8view(string)
+ : m_view(string.view())
{
}
RegexStringView(const StringView view)
- : m_u8view(view)
+ : m_view(view)
{
}
- RegexStringView(const Utf32View view)
- : m_u32view(view)
+
+ RegexStringView(Utf32View view)
+ : m_view(view)
+ {
+ }
+
+ RegexStringView(Utf8View view)
+ : m_view(view)
{
}
- bool is_u8_view() const { return m_u8view.has_value(); }
- bool is_u32_view() const { return m_u32view.has_value(); }
+ const StringView& string_view() const
+ {
+ return m_view.get<StringView>();
+ }
- const StringView& u8view() const
+ const Utf32View& u32_view() const
{
- VERIFY(m_u8view.has_value());
- return m_u8view.value();
- };
+ return m_view.get<Utf32View>();
+ }
- const Utf32View& u32view() const
+ const Utf8View& u8_view() const
{
- VERIFY(m_u32view.has_value());
- return m_u32view.value();
- };
+ return m_view.get<Utf8View>();
+ }
bool is_empty() const
{
- if (is_u8_view())
- return m_u8view.value().is_empty();
- else
- return m_u32view.value().is_empty();
+ return m_view.visit([](auto& view) { return view.is_empty(); });
}
bool is_null() const
{
- if (is_u8_view())
- return m_u8view.value().is_null();
- else
- return m_u32view.value().code_points() == nullptr;
+ return m_view.visit([](auto& view) { return view.is_null(); });
}
size_t length() const
{
- if (is_u8_view())
- return m_u8view.value().length();
- else
- return m_u32view.value().length();
+ return m_view.visit([](auto& view) { return view.length(); });
+ }
+
+ RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage) const
+ {
+ return m_view.visit(
+ [&]<typename T>(T const&) {
+ StringBuilder builder;
+ for (auto ch : data)
+ builder.append(ch); // Note: The type conversion is intentional.
+ optional_string_storage = builder.build();
+ return RegexStringView { T { *optional_string_storage } };
+ },
+ [&](Utf32View) {
+ return RegexStringView { Utf32View { data.data(), data.size() } };
+ });
}
Vector<RegexStringView> lines() const
{
- if (is_u8_view()) {
- auto views = u8view().lines(false);
- Vector<RegexStringView> new_views;
- for (auto& view : views)
- new_views.append(move(view));
- return new_views;
- }
-
- Vector<RegexStringView> views;
- auto view = u32view();
- u32 newline = '\n';
- while (!view.is_empty()) {
- auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
- if (!position.has_value())
- break;
- auto offset = position.value() / sizeof(u32);
- views.append(view.substring_view(0, offset));
- view = view.substring_view(offset + 1, view.length() - offset - 1);
- }
- if (!view.is_empty())
- views.append(view);
- return views;
+ return m_view.visit(
+ [](StringView view) {
+ auto views = view.lines(false);
+ Vector<RegexStringView> new_views;
+ for (auto& view : views)
+ new_views.empend(view);
+ return new_views;
+ },
+ [](Utf32View view) {
+ Vector<RegexStringView> views;
+ u32 newline = '\n';
+ while (!view.is_empty()) {
+ auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
+ if (!position.has_value())
+ break;
+ auto offset = position.value() / sizeof(u32);
+ views.empend(view.substring_view(0, offset));
+ view = view.substring_view(offset + 1, view.length() - offset - 1);
+ }
+ if (!view.is_empty())
+ views.empend(view);
+ return views;
+ },
+ [](Utf8View& view) {
+ Vector<RegexStringView> views;
+ auto it = view.begin();
+ auto previous_newline_position_it = it;
+ for (;;) {
+ if (*it == '\n') {
+ auto previous_offset = view.byte_offset_of(previous_newline_position_it);
+ auto new_offset = view.byte_offset_of(it);
+ auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
+ views.empend(slice);
+ ++it;
+ previous_newline_position_it = it;
+ }
+ if (it.done())
+ break;
+ ++it;
+ }
+ if (it != previous_newline_position_it) {
+ auto previous_offset = view.byte_offset_of(previous_newline_position_it);
+ auto new_offset = view.byte_offset_of(it);
+ auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
+ views.empend(slice);
+ }
+ return views;
+ });
}
RegexStringView substring_view(size_t offset, size_t length) const
{
- if (is_u8_view()) {
- return u8view().substring_view(offset, length);
- }
- return u32view().substring_view(offset, length);
+ return m_view.visit(
+ [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
+ [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
}
String to_string() const
{
- if (is_u8_view()) {
- return u8view().to_string();
- }
-
- StringBuilder builder;
- builder.append(u32view());
- return builder.to_string();
+ return m_view.visit(
+ [](StringView view) { return view.to_string(); },
+ [](auto& view) {
+ StringBuilder builder;
+ for (auto it = view.begin(); it != view.end(); ++it)
+ builder.append_code_point(*it);
+ return builder.to_string();
+ });
}
u32 operator[](size_t index) const
{
- if (is_u8_view()) {
- i8 ch = u8view()[index];
- u8 value = *reinterpret_cast<u8*>(&ch);
- return static_cast<u32>(value);
- }
- return u32view().code_points()[index];
+ return m_view.visit(
+ [&](StringView view) -> u32 {
+ auto ch = view[index];
+ if (ch < 0)
+ return 256u + ch;
+ return ch;
+ },
+ [&](auto view) -> u32 { return view[index]; },
+ [&](Utf8View& view) -> u32 {
+ size_t i = index;
+ for (auto it = view.begin(); it != view.end(); ++it, --i) {
+ if (i == 0)
+ return *it;
+ }
+ VERIFY_NOT_REACHED();
+ });
}
bool operator==(const char* cstring) const
{
- if (is_u8_view())
- return u8view() == cstring;
-
- return to_string() == cstring;
+ return m_view.visit(
+ [&](Utf32View) { return to_string() == cstring; },
+ [&](Utf8View const& view) { return view.as_string() == cstring; },
+ [&](StringView view) { return view == cstring; });
}
bool operator!=(const char* cstring) const
@@ -149,18 +199,18 @@ public:
bool operator==(const String& string) const
{
- if (is_u8_view())
- return u8view() == string;
-
- return to_string() == string;
+ return m_view.visit(
+ [&](Utf32View) { return to_string() == string; },
+ [&](Utf8View const& view) { return view.as_string() == string; },
+ [&](StringView view) { return view == string; });
}
- bool operator==(const StringView& other) const
+ bool operator==(const StringView& string) const
{
- if (is_u8_view())
- return u8view() == other;
-
- return false;
+ return m_view.visit(
+ [&](Utf32View) { return to_string() == string; },
+ [&](Utf8View const& view) { return view.as_string() == string; },
+ [&](StringView view) { return view == string; });
}
bool operator!=(const StringView& other) const
@@ -170,13 +220,12 @@ public:
bool operator==(const Utf32View& other) const
{
- if (is_u32_view()) {
- StringBuilder builder;
- builder.append(other);
- return to_string() == builder.to_string();
- }
-
- return false;
+ return m_view.visit(
+ [&](Utf32View view) {
+ return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
+ },
+ [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
+ [&](StringView view) { return view == RegexStringView { other }.to_string(); });
}
bool operator!=(const Utf32View& other) const
@@ -184,34 +233,78 @@ public:
return !(*this == other);
}
- const char* characters_without_null_termination() const
+ bool operator==(const Utf8View& other) const
+ {
+ return m_view.visit(
+ [&](Utf32View) {
+ return to_string() == other.as_string();
+ },
+ [&](Utf8View const& view) { return view.as_string() == other.as_string(); },
+ [&](StringView view) { return other.as_string() == view; });
+ }
+
+ bool operator!=(const Utf8View& other) const
+ {
+ return !(*this == other);
+ }
+
+ bool equals(const RegexStringView& other) const
{
- if (is_u8_view())
- return u8view().characters_without_null_termination();
+ return other.m_view.visit([&](auto const& view) { return operator==(view); });
+ }
- return to_string().characters(); // FIXME: it contains the null termination, does that actually matter?
+ bool equals_ignoring_case(const RegexStringView& other) const
+ {
+ // FIXME: Implement equals_ignoring_case() for unicode.
+ return m_view.visit(
+ [&](StringView view) {
+ return other.m_view.visit(
+ [&](StringView other_view) { return view.equals_ignoring_case(other_view); },
+ [](auto&) -> bool { TODO(); });
+ },
+ [](auto&) -> bool { TODO(); });
}
bool starts_with(const StringView& str) const
{
- if (is_u32_view())
- return false;
- return u8view().starts_with(str);
+ return m_view.visit(
+ [&](Utf32View) -> bool {
+ TODO();
+ },
+ [&](Utf8View const& view) { return view.as_string().starts_with(str); },
+ [&](StringView view) { return view.starts_with(str); });
}
bool starts_with(const Utf32View& str) const
{
- if (is_u8_view())
- return false;
-
- StringBuilder builder;
- builder.append(str);
- return to_string().starts_with(builder.to_string());
+ return m_view.visit(
+ [&](Utf32View view) -> bool {
+ if (str.length() > view.length())
+ return false;
+ if (str.length() == view.length())
+ return operator==(str);
+ for (size_t i = 0; i < str.length(); ++i) {
+ if (str.at(i) != view.at(i))
+ return false;
+ }
+ return true;
+ },
+ [&](Utf8View const& view) {
+ auto it = view.begin();
+ for (auto code_point : str) {
+ if (it.done())
+ return false;
+ if (code_point != *it)
+ return false;
+ ++it;
+ }
+ return true;
+ },
+ [&](StringView) -> bool { TODO(); });
}
private:
- Optional<StringView> m_u8view;
- Optional<Utf32View> m_u32view;
+ Variant<StringView, Utf8View, Utf32View> m_view;
};
class Match final {
@@ -271,6 +364,9 @@ struct MatchState {
size_t string_position { 0 };
size_t instruction_position { 0 };
size_t fork_at_position { 0 };
+ Vector<Match> matches;
+ Vector<Vector<Match>> capture_group_matches;
+ Vector<HashMap<String, Match>> named_capture_group_matches;
};
struct MatchOutput {
@@ -288,6 +384,7 @@ template<>
struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
void format(FormatBuilder& builder, const regex::RegexStringView& value)
{
- return Formatter<StringView>::format(builder, { value.characters_without_null_termination(), value.length() });
+ auto string = value.to_string();
+ return Formatter<StringView>::format(builder, string);
}
};