diff options
author | Timothy Flynn <trflynn89@pm.me> | 2022-02-05 10:43:13 -0500 |
---|---|---|
committer | Ali Mohammad Pur <Ali.mpfard@gmail.com> | 2022-02-05 22:30:10 +0330 |
commit | 2212aa2388c4f6abae577daa2cbb27f8796939d4 (patch) | |
tree | 7ae0b0f248e7e2c537a892a5b6bbaea8b15a6ea1 /Userland/Libraries/LibRegex | |
parent | 54845c4bf2fe9db0ff9e661f18f883dc3e7f89ba (diff) | |
download | serenity-2212aa2388c4f6abae577daa2cbb27f8796939d4.zip |
LibRegex: Support non-ASCII whitespace characters when matching \s or \S
ECMA-262 defines \s as:
    Return the CharSet containing all characters corresponding to a code
    point on the right-hand side of the WhiteSpace or LineTerminator
    productions.
The LineTerminator production is simply: U+000A, U+000D, U+2028, or
U+2029. Unfortunately there isn't a Unicode property that covers just
those code points.
The WhiteSpace production is: U+0009, U+000B, U+000C, U+FEFF, or any
code point with the Space_Separator general category.
If the Unicode generators are disabled, this will fall back to ASCII
space code points.
Diffstat (limited to 'Userland/Libraries/LibRegex')
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.cpp | 14 |
1 file changed, 13 insertions, 1 deletion
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index eb9b19dbc9..4a1021ff77 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -659,6 +659,18 @@ ALWAYS_INLINE bool OpCode_Compare::compare_string(MatchInput const& input, Match
 
 ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched)
 {
+    auto is_space_or_line_terminator = [](u32 code_point) {
+        static auto space_separator = Unicode::general_category_from_string("Space_Separator"sv);
+        if (!space_separator.has_value())
+            return is_ascii_space(code_point);
+
+        if ((code_point == 0x0a) || (code_point == 0x0d) || (code_point == 0x2028) || (code_point == 0x2029))
+            return true;
+        if ((code_point == 0x09) || (code_point == 0x0b) || (code_point == 0x0c) || (code_point == 0xfeff))
+            return true;
+        return Unicode::code_point_has_general_category(code_point, *space_separator);
+    };
+
     switch (character_class) {
     case CharClass::Alnum:
         if (is_ascii_alphanumeric(ch)) {
@@ -729,7 +741,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
         }
         break;
     case CharClass::Space:
-        if (is_ascii_space(ch)) {
+        if (is_space_or_line_terminator(ch)) {
             if (inverse)
                 inverse_matched = true;
             else |