diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-07-20 22:33:00 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2021-07-23 23:06:57 +0100 |
commit | 47f6bb38a1bd3c39324d11b4eec1d8d8993658a2 (patch) | |
tree | b6d3f75b7964cf50f42fb99930e3854abbc5fb2f /Tests/LibRegex | |
parent | 2e45e52993a47b7aa6d61aed7298fb78ea746695 (diff) | |
download | serenity-47f6bb38a1bd3c39324d11b4eec1d8d8993658a2.zip |
LibRegex: Support UTF-16 RegexStringView and improve Unicode matching
When the Unicode option is not set, regular expressions should match
based on code units; when it is set, they should match based on code
points. To do so, the regex parser must combine surrogate pairs when
the Unicode option is set. Further, RegexStringView needs to know if
the flag is set in order to return code point vs. code unit based
string lengths and substrings.
Diffstat (limited to 'Tests/LibRegex')
-rw-r--r-- | Tests/LibRegex/Regex.cpp | 45 |
1 files changed, 44 insertions, 1 deletions
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp
index 1643acd37b..a4731e62d3 100644
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@@ -506,10 +506,14 @@ TEST_CASE(ECMA262_parse)
         { ",(?", regex::Error::InvalidCaptureGroup }, // #4583
         { "{1}", regex::Error::InvalidPattern },
         { "{1,2}", regex::Error::InvalidPattern },
+        { "\\uxxxx", regex::Error::NoError },
+        { "\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
+        { "\\ud83d", regex::Error::NoError, ECMAScriptFlags::Unicode },
+        { "\\ud83d\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
     };
 
     for (auto& test : tests) {
-        Regex<ECMA262> re(test.pattern);
+        Regex<ECMA262> re(test.pattern, test.flags);
         EXPECT_EQ(re.parser_result.error, test.expected_error);
         if constexpr (REGEX_DEBUG) {
             dbgln("\n");
@@ -586,6 +590,45 @@ TEST_CASE(ECMA262_match)
     }
 }
 
+TEST_CASE(ECMA262_unicode_match)
+{
+    struct _test {
+        char const* pattern;
+        char const* subject;
+        bool matches { true };
+        ECMAScriptFlags options {};
+    };
+    _test tests[] {
+        { "\\ud83d", "😀", true },
+        { "\\ud83d", "😀", false, ECMAScriptFlags::Unicode },
+        { "\\ude00", "😀", true },
+        { "\\ude00", "😀", false, ECMAScriptFlags::Unicode },
+        { "\\ud83d\\ude00", "😀", true },
+        { "\\ud83d\\ude00", "😀", true, ECMAScriptFlags::Unicode },
+        { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true },
+        { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true, ECMAScriptFlags::Unicode },
+    };
+
+    for (auto& test : tests) {
+        Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | test.options);
+
+        auto subject = AK::utf8_to_utf16(test.subject);
+        Utf16View view { subject };
+
+        if constexpr (REGEX_DEBUG) {
+            dbgln("\n");
+            RegexDebug regex_dbg(stderr);
+            regex_dbg.print_raw_bytecode(re);
+            regex_dbg.print_header();
+            regex_dbg.print_bytecode(re);
+            dbgln("\n");
+        }
+
+        EXPECT_EQ(re.parser_result.error, Error::NoError);
+        EXPECT_EQ(re.match(view).success, test.matches);
+    }
+}
+
 TEST_CASE(replace)
 {
     struct _test {