4 files changed, 123 insertions, 20 deletions
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index 677ad0cb0d..f5869acbdb 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -465,12 +465,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
             Optional<String> str;
+            Vector<u16> utf16;
             Vector<u32> data;
             data.ensure_capacity(length);
             for (size_t i = offset; i < offset + length; ++i)
                 data.unchecked_append(m_bytecode->at(i));
 
-            auto view = input.view.construct_as_same(data, str);
+            auto view = input.view.construct_as_same(data, str, utf16);
             offset += length;
             if (!compare_string(input, state, view, had_zero_length_match))
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
@@ -553,7 +554,8 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt
 
     auto input_view = input.view.substring_view(state.string_position, 1);
     Optional<String> str;
-    auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str);
+    Vector<u16> utf16;
+    auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str, utf16);
     bool equal;
     if (input.regex_options & AllFlags::Insensitive)
         equal = input_view.equals_ignoring_case(compare_view);
diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h
index b58dc5e132..6bc58ad78f 100644
--- a/Userland/Libraries/LibRegex/RegexMatch.h
+++ b/Userland/Libraries/LibRegex/RegexMatch.h
@@ -14,6 +14,7 @@
 #include <AK/String.h>
 #include <AK/StringBuilder.h>
 #include <AK/StringView.h>
+#include <AK/Utf16View.h>
 #include <AK/Utf32View.h>
 #include <AK/Utf8View.h>
 #include <AK/Variant.h>
@@ -43,6 +44,11 @@ public:
     {
     }
 
+    RegexStringView(Utf16View view) // Wraps a UTF-16 view; it is stored as the alternative held by m_view.
+        : m_view(view)
+    {
+    }
+
     RegexStringView(Utf8View view)
         : m_view(view)
     {
@@ -58,11 +64,19 @@ public:
         return m_view.get<Utf32View>();
     }
 
+    Utf16View const& u16_view() const // Accessor for the UTF-16 alternative of m_view.
+    {
+        return m_view.get<Utf16View>(); // NOTE(review): presumably requires m_view to currently hold a Utf16View - confirm against Variant::get.
+    }
+
     Utf8View const& u8_view() const
     {
         return m_view.get<Utf8View>();
     }
 
+    bool unicode() const { return m_unicode; } // Reads the m_unicode flag; length() and substring_view() branch on it.
+    void set_unicode(bool unicode) { m_unicode = unicode; } // Overwrites the m_unicode flag with the given value.
+
     bool is_empty() const
     {
         return m_view.visit([](auto& view) { return view.is_empty(); });
@@ -75,12 +89,21 @@ public:
 
     size_t length() const
     {
-        return m_view.visit([](auto& view) { return view.length(); });
+        if (unicode()) {
+            return m_view.visit(
+                [](Utf16View const& view) { return view.length_in_code_points(); },
+                [](auto const& view) { return view.length(); });
+        }
+
+        return m_view.visit(
+            [](Utf16View const& view) { return view.length_in_code_units(); },
+            [](Utf8View const& view) { return view.byte_length(); },
+            [](auto const& view) { return view.length(); });
     }
 
-    RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage) const
+    RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage, Vector<u16>& optional_utf16_storage) const
     {
-        return m_view.visit(
+        auto view = m_view.visit(
             [&]<typename T>(T const&) {
                 StringBuilder builder;
                 for (auto ch : data)
@@ -90,7 +113,14 @@ public:
             },
             [&](Utf32View) {
                 return RegexStringView { Utf32View { data.data(), data.size() } };
+            },
+            [&](Utf16View) {
+                optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() });
+                return RegexStringView { Utf16View { optional_utf16_storage } };
             });
+
+        view.set_unicode(unicode());
+        return view;
     }
 
     Vector<RegexStringView> lines() const
@@ -118,6 +148,21 @@ public:
                     views.empend(view);
                 return views;
             },
+            [](Utf16View view) { // Splits a UTF-16 view into one RegexStringView per line, breaking at '\n' code units.
+                // Compare whole code units; a byte-wise search can match across a code unit boundary.
+                Vector<RegexStringView> views;
+                size_t const total = view.length_in_code_units();
+                size_t line_start = 0;
+                for (size_t i = 0; i < total; ++i) {
+                    if (view.data()[i] != '\n')
+                        continue;
+                    views.empend(view.substring_view(line_start, i - line_start)); // May be empty for consecutive newlines.
+                    line_start = i + 1; // Skip the newline itself.
+                }
+                if (line_start < total) // A trailing empty line is not reported.
+                    views.empend(view.substring_view(line_start, total - line_start));
+                return views;
+            },
             [](Utf8View& view) {
                 Vector<RegexStringView> views;
                 auto it = view.begin();
@@ -147,15 +192,26 @@ public:
 
     RegexStringView substring_view(size_t offset, size_t length) const
     {
-        return m_view.visit(
-            [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
-            [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
+        if (unicode()) {
+            auto view = m_view.visit(
+                [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
+                [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; },
+                [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
+
+            view.set_unicode(unicode());
+            return view;
+        }
+
+        auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
+        view.set_unicode(unicode());
+        return view;
     }
 
     String to_string() const
     {
         return m_view.visit(
             [](StringView view) { return view.to_string(); },
+            [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); },
             [](auto& view) {
                 StringBuilder builder;
                 for (auto it = view.begin(); it != view.end(); ++it)
@@ -173,8 +229,8 @@ public:
                     return 256u + ch;
                 return ch;
             },
-            [&](auto view) -> u32 { return view[index]; },
-            [&](Utf8View& view) -> u32 {
+            [&](Utf32View& view) -> u32 { return view[index]; },
+            [&](auto& view) -> u32 {
                 size_t i = index;
                 for (auto it = view.begin(); it != view.end(); ++it, --i) {
                     if (i == 0)
@@ -188,6 +244,7 @@ public:
     {
         return m_view.visit(
             [&](Utf32View) { return to_string() == cstring; },
+            [&](Utf16View) { return to_string() == cstring; },
             [&](Utf8View const& view) { return view.as_string() == cstring; },
             [&](StringView view) { return view == cstring; });
     }
@@ -201,6 +258,7 @@ public:
     {
         return m_view.visit(
             [&](Utf32View) { return to_string() == string; },
+            [&](Utf16View) { return to_string() == string; },
             [&](Utf8View const& view) { return view.as_string() == string; },
             [&](StringView view) { return view == string; });
     }
@@ -209,6 +267,7 @@ public:
     {
         return m_view.visit(
             [&](Utf32View) { return to_string() == string; },
+            [&](Utf16View) { return to_string() == string; },
             [&](Utf8View const& view) { return view.as_string() == string; },
             [&](StringView view) { return view == string; });
     }
@@ -224,6 +283,7 @@ public:
             [&](Utf32View view) {
                 return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
             },
+            [&](Utf16View) { return to_string() == RegexStringView { other }.to_string(); },
             [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
             [&](StringView view) { return view == RegexStringView { other }.to_string(); });
     }
@@ -233,12 +293,25 @@ public:
         return !(*this == other);
     }
 
+    bool operator==(Utf16View const& other) const // Equality against a UTF-16 view, dispatched on the alternative held by m_view.
+    {
+        return m_view.visit(
+            [&](Utf32View) { return to_string() == RegexStringView { other }.to_string(); }, // Different encodings: compare the to_string() results of both sides.
+            [&](Utf16View const& view) { return view == other; }, // Same encoding: delegate to Utf16View's own operator==.
+            [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); }, // Compare the UTF-8 text with other's to_string() result.
+            [&](StringView view) { return view == RegexStringView { other }.to_string(); }); // Compare the string view with other's to_string() result.
+    }
+
+    bool operator!=(Utf16View const& other) const // Inequality against a UTF-16 view.
+    {
+        return !(*this == other); // Defined as the negation of operator==(Utf16View const&).
+    }
+
     bool operator==(Utf8View const& other) const
     {
         return m_view.visit(
-            [&](Utf32View) {
-                return to_string() == other.as_string();
-            },
+            [&](Utf32View) { return to_string() == other.as_string(); },
+            [&](Utf16View) { return to_string() == other.as_string(); },
             [&](Utf8View const& view) { return view.as_string() == other.as_string(); },
             [&](StringView view) { return other.as_string() == view; });
     }
@@ -271,6 +344,9 @@ public:
             [&](Utf32View) -> bool {
                 TODO();
             },
+            [&](Utf16View) -> bool {
+                TODO();
+            },
             [&](Utf8View const& view) { return view.as_string().starts_with(str); },
             [&](StringView view) { return view.starts_with(str); });
     }
@@ -289,6 +365,7 @@ public:
                 }
                 return true;
             },
+            [&](Utf16View) -> bool { TODO(); },
             [&](Utf8View const& view) {
                 auto it = view.begin();
                 for (auto code_point : str) {
@@ -304,7 +381,8 @@ public:
     }
 
 private:
-    Variant<StringView, Utf8View, Utf32View> m_view;
+    Variant<StringView, Utf8View, Utf16View, Utf32View> m_view;
+    bool m_unicode { false };
 };
 
 class Match final {
diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp
index 60783b25f6..f4a848741a 100644
--- a/Userland/Libraries/LibRegex/RegexMatcher.cpp
+++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp
@@ -84,6 +84,10 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const views, Optional
     output.operations = 0;
     size_t lines_to_skip = 0;
 
+    bool unicode = input.regex_options.has_flag_set(AllFlags::Unicode);
+    for (auto& view : views)
+        const_cast<RegexStringView&>(view).set_unicode(unicode);
+
     if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
         if (views.size() > 1 && input.start_offset > views.first().length()) {
             dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip);
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index 12c62fef78..07885173b6 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -10,6 +10,7 @@
 #include <AK/String.h>
 #include <AK/StringBuilder.h>
 #include <AK/StringUtils.h>
+#include <AK/Utf16View.h>
 
 namespace regex {
 
@@ -1440,13 +1441,31 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
 
     if (try_skip("u")) {
         if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) {
-            // FIXME: The minimum length depends on the mode - should be utf8-length in u8 mode.
+            // In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be
+            // rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit,
+            // but doesn't form a valid surrogate pair, insert bytecode for both code units individually.
+            Optional<u32> low_surrogate;
+            if (unicode && Utf16View::is_high_surrogate(*code_point) && try_skip("\\u")) {
+                low_surrogate = read_digits(ReadDigitsInitialZeroState::Allow, true, 4);
+                if (!low_surrogate.has_value()) {
+                    set_error(Error::InvalidPattern);
+                    return false;
+                }
+
+                if (Utf16View::is_low_surrogate(*low_surrogate)) {
+                    *code_point = Utf16View::decode_surrogate_pair(*code_point, *low_surrogate);
+                    low_surrogate.clear();
+                }
+            }
+
             match_length_minimum += 1;
-            StringBuilder builder;
-            builder.append_code_point(code_point.value());
-            // FIXME: This isn't actually correct for ECMAScript.
-            auto u8_encoded = builder.string_view();
-            stack.insert_bytecode_compare_string(u8_encoded);
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
+
+            if (low_surrogate.has_value()) {
+                match_length_minimum += 1;
+                stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)low_surrogate.value() } });
+            }
+
             return true;
         } else if (!unicode) {
             // '\u' is allowed in non-unicode mode, just matches 'u'.