LibRegex: Support UTF-16 RegexStringView and improve Unicode matching

When the Unicode option is not set, regular expressions should match based on code units; when it is set, they should match based on code points. To do so, the regex parser must combine surrogate pairs when the Unicode option is set. Further, RegexStringView needs to know if the flag is set in order to return code point vs. code unit based string lengths and substrings.
author: Timothy Flynn <trflynn89@pm.me> 2021-07-20 22:33:00 -0400
committer: Linus Groh <mail@linusgroh.de> 2021-07-23 23:06:57 +0100
commit: 47f6bb38a1bd3c39324d11b4eec1d8d8993658a2 (patch)
tree: b6d3f75b7964cf50f42fb99930e3854abbc5fb2f /Tests/LibRegex
parent: 2e45e52993a47b7aa6d61aed7298fb78ea746695 (diff)
download: serenity-47f6bb38a1bd3c39324d11b4eec1d8d8993658a2.zip
1 file changed, 44 insertions, 1 deletion
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp
index 1643acd37b..a4731e62d3 100644
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@@ -506,10 +506,14 @@ TEST_CASE(ECMA262_parse)
         { ",(?", regex::Error::InvalidCaptureGroup }, // #4583
         { "{1}", regex::Error::InvalidPattern },
         { "{1,2}", regex::Error::InvalidPattern },
+        { "\\uxxxx", regex::Error::NoError },
+        { "\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
+        { "\\ud83d", regex::Error::NoError, ECMAScriptFlags::Unicode },
+        { "\\ud83d\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
     };
 
     for (auto& test : tests) {
-        Regex<ECMA262> re(test.pattern);
+        Regex<ECMA262> re(test.pattern, test.flags);
         EXPECT_EQ(re.parser_result.error, test.expected_error);
         if constexpr (REGEX_DEBUG) {
             dbgln("\n");
@@ -586,6 +590,45 @@ TEST_CASE(ECMA262_match)
     }
 }
 
+TEST_CASE(ECMA262_unicode_match) // Matches \u escapes against UTF-16 subjects, with and without the Unicode flag.
+{
+    struct _test {
+        char const* pattern;
+        char const* subject;
+        bool matches { true }; // expected value of match(...).success
+        ECMAScriptFlags options {}; // OR'd with the Global flag when the regex is constructed below
+    };
+    _test tests[] {
+        { "\\ud83d", "😀", true }, // 😀 is U+1F600 (UTF-16: D83D DE00); without the Unicode flag the high surrogate alone is expected to match
+        { "\\ud83d", "😀", false, ECMAScriptFlags::Unicode }, // with the Unicode flag, half of a surrogate pair is expected not to match
+        { "\\ude00", "😀", true }, // same as above, for the low surrogate
+        { "\\ude00", "😀", false, ECMAScriptFlags::Unicode },
+        { "\\ud83d\\ude00", "😀", true }, // both surrogates in sequence are expected to match in either mode
+        { "\\ud83d\\ude00", "😀", true, ECMAScriptFlags::Unicode },
+        { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true }, // subject bytes are the 3-byte encoding of lone surrogate U+D83D, twice; presumably utf8_to_utf16 yields 0xD83D for each (confirm)
+        { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true, ECMAScriptFlags::Unicode }, // two high surrogates cannot form a pair, so a match is still expected with the Unicode flag
+    };
+
+    for (auto& test : tests) {
+        Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | test.options);
+
+        auto subject = AK::utf8_to_utf16(test.subject); // subjects are written as UTF-8 literals and converted so matching runs on a Utf16View
+        Utf16View view { subject };
+
+        if constexpr (REGEX_DEBUG) {
+            dbgln("\n");
+            RegexDebug regex_dbg(stderr);
+            regex_dbg.print_raw_bytecode(re);
+            regex_dbg.print_header();
+            regex_dbg.print_bytecode(re);
+            dbgln("\n");
+        }
+
+        EXPECT_EQ(re.parser_result.error, Error::NoError); // every pattern in this table is expected to parse in its mode
+        EXPECT_EQ(re.match(view).success, test.matches);
+    }
+}
+
 TEST_CASE(replace)
 {
     struct _test {
author	Timothy Flynn <trflynn89@pm.me>	2021-07-20 22:33:00 -0400
committer	Linus Groh <mail@linusgroh.de>	2021-07-23 23:06:57 +0100
commit	47f6bb38a1bd3c39324d11b4eec1d8d8993658a2 (patch)
tree	b6d3f75b7964cf50f42fb99930e3854abbc5fb2f /Tests/LibRegex
parent	2e45e52993a47b7aa6d61aed7298fb78ea746695 (diff)
download	serenity-47f6bb38a1bd3c39324d11b4eec1d8d8993658a2.zip