summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Timothy Flynn <trflynn89@pm.me> 2021-08-11 16:41:57 -0400
committer: Linus Groh <mail@linusgroh.de> 2021-08-15 11:43:45 +0100
commit: 6a485f612fbda354c8c67e9ccc3c119356687354 (patch)
tree: 193fda6eb17b87d404501de0fbf028db6997ebce
parent: 83ca8c7e381c35ee61a957c1fc82a9db9c6db6fb (diff)
download: serenity-6a485f612fbda354c8c67e9ccc3c119356687354.zip
LibRegex: Implement legacy octal escape parsing closer to the spec
The grammar for the ECMA-262 CharacterEscape is:

    CharacterEscape[U, N] ::
        ControlEscape
        c ControlLetter
        0 [lookahead ∉ DecimalDigit]
        HexEscapeSequence
        RegExpUnicodeEscapeSequence[?U]
        [~U]LegacyOctalEscapeSequence
        IdentityEscape[?U, ?N]

It's important to parse the standalone "\0 [lookahead ∉ DecimalDigit]" before parsing LegacyOctalEscapeSequence. Otherwise, all standalone "\0" patterns are parsed as legacy octal escapes, which are disallowed in Unicode mode.

Further, LegacyOctalEscapeSequence should also be parsed while parsing character classes.
-rw-r--r--Tests/LibRegex/Regex.cpp20
-rw-r--r--Userland/Libraries/LibRegex/RegexParser.cpp32
2 files changed, 43 insertions, 9 deletions
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp
index 36ac6f1498..2e8cce5c30 100644
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@@ -22,6 +22,12 @@ static PosixOptions match_test_api_options(const PosixOptions options)
return options;
}
+template<typename... Flags> // Test helper: combines any number of ECMAScriptFlags values into one.
+static constexpr ECMAScriptFlags combine_flags(Flags&&... flags) requires((IsSame<Flags, ECMAScriptFlags> && ...)) // Constraint: IsSame<Flags, ECMAScriptFlags> must hold for every argument type in the pack.
+{
+ return static_cast<ECMAScriptFlags>((static_cast<regex::FlagsUnderlyingType>(flags) | ...)); // Bitwise-OR the underlying value of each flag, then cast the result back to ECMAScriptFlags.
+}
+
TEST_CASE(regex_options_ecmascript)
{
ECMAScriptOptions eo;
@@ -543,6 +549,14 @@ TEST_CASE(ECMA262_parse)
{ "\\A"sv, regex::Error::InvalidCharacterClass, ECMAScriptFlags::Unicode },
{ "[\\A]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
{ "[\\A]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
+ { "\\0"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
+ { "\\0"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
+ { "\\00"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
+ { "\\00"sv, regex::Error::InvalidCharacterClass, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
+ { "[\\0]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
+ { "[\\0]"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
+ { "[\\00]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
+ { "[\\00]"sv, regex::Error::InvalidPattern, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
};
for (auto& test : tests) {
@@ -606,6 +620,12 @@ TEST_CASE(ECMA262_match)
"return /xx/"sv, true, ECMAScriptFlags::BrowserExtended
}, // #5517, appears to be matching JS expressions that involve regular expressions...
{ "a{2,}"sv, "aaaa"sv }, // #5518
+ { "\\0"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended },
+ { "\\0"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
+ { "\\01"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
+ { "[\\0]"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended },
+ { "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
+ { "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
};
// clang-format on
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index a19faa02f0..a0393a6ac0 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -18,6 +18,7 @@ namespace regex {
static constexpr size_t s_maximum_repetition_count = 1024 * 1024;
static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv;
+static constexpr auto s_decimal_characters = "0123456789"sv;
ALWAYS_INLINE bool Parser::set_error(Error error)
{
@@ -1430,6 +1431,17 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
return true;
}
+ // '\0'
+ if (try_skip("0")) {
+ if (!lookahead_any(s_decimal_characters)) {
+ match_length_minimum += 1;
+ stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
+ return true;
+ }
+
+ back();
+ }
+
// LegacyOctalEscapeSequence
if (m_should_use_browser_extended_grammar) {
if (!unicode) {
@@ -1441,13 +1453,6 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
}
}
- // '\0'
- if (try_skip("0")) {
- match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
- return true;
- }
-
// HexEscape
if (try_skip("x")) {
if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) {
@@ -1797,8 +1802,17 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
}
// '\0'
- if (try_skip("0"))
- return { CharClassRangeElement { .code_point = 0, .is_character_class = false } };
+ if (try_skip("0")) {
+ if (!lookahead_any(s_decimal_characters))
+ return { CharClassRangeElement { .code_point = 0, .is_character_class = false } };
+ back();
+ }
+
+ // LegacyOctalEscapeSequence
+ if (m_should_use_browser_extended_grammar && !unicode) {
+ if (auto escape = parse_legacy_octal_escape(); escape.has_value())
+ return { CharClassRangeElement { .code_point = escape.value(), .is_character_class = false } };
+ }
// HexEscape
if (try_skip("x")) {