diff options
author | Ali Mohammad Pur <ali.mpfard@gmail.com> | 2022-07-20 23:19:43 +0430 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2022-07-20 21:25:59 +0100 |
commit | 77349149092dea81849eac1fee1799df8b3919a6 (patch) | |
tree | bbd1f4698d25575f3b914e6553b8cfe3c2e97036 | |
parent | b908f9f6ef2fb4f7f97437383bd257e7646c1609 (diff) | |
download | serenity-77349149092dea81849eac1fee1799df8b3919a6.zip |
LibRegex: Refactor parsing 'CharacterEscape' out of 'AtomEscape'
The ECMA262 spec has this as a separate production, and we need it to be
split up for a future commit.
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.cpp | 93 |
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.h | 2 |
2 files changed, 54 insertions, 41 deletions
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 442a662528..2503f38558 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -1428,59 +1428,34 @@ bool ECMA262Parser::parse_invalid_braced_quantifier() bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& compares, size_t& match_length_minimum, ParseFlags flags) { - if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow); !escape_str.is_empty()) { - if (auto escape = escape_str.to_uint(); escape.has_value()) { - // See if this is a "back"-reference (we've already parsed the group it refers to) - auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value()); - if (maybe_length.has_value()) { - match_length_minimum += maybe_length.value(); - stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } }); - return true; - } - // It's not a pattern seen before, so we have to see if it's a valid reference to a future group. - if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) { - // This refers to a future group, and it will _always_ be matching an empty string - // So just match nothing and move on. - return true; - } - if (!m_should_use_browser_extended_grammar) { - set_error(Error::InvalidNumber); - return false; - } - } - - // If not, put the characters back. 
- back(escape_str.length()); - } - // CharacterEscape > ControlEscape if (try_skip("f"sv)) { match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\f' }); return true; } if (try_skip("n"sv)) { match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\n' }); return true; } if (try_skip("r"sv)) { match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\r' }); return true; } if (try_skip("t"sv)) { match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\t' }); return true; } if (try_skip("v"sv)) { match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\v' }); return true; } @@ -1489,7 +1464,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp for (auto c : s_alphabetic_characters) { if (try_skip({ &c, 1 })) { match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)(c % 32) }); return true; } } @@ -1500,15 +1475,15 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp } if (m_should_use_browser_extended_grammar) { - back(1 + !done()); - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\\' } }); + back(1 + (done() ? 
0 : 1)); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\\' }); match_length_minimum += 1; return true; } // Allow '\c' in non-unicode mode, just matches 'c'. match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'c' }); return true; } @@ -1516,7 +1491,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp if (try_skip("0"sv)) { if (!lookahead_any(s_decimal_characters)) { match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)0 }); return true; } @@ -1527,7 +1502,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp if (m_should_use_browser_extended_grammar) { if (!flags.unicode) { if (auto escape = parse_legacy_octal_escape(); escape.has_value()) { - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)escape.value() } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)escape.value() }); match_length_minimum += 1; return true; } @@ -1538,13 +1513,13 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp if (try_skip("x"sv)) { if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) { match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() }); return true; } if (!flags.unicode) { // '\x' is allowed in non-unicode mode, just matches 'x'. 
match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'x' }); return true; } @@ -1555,7 +1530,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp if (try_skip("u"sv)) { if (auto code_point = consume_escaped_code_point(flags.unicode); code_point.has_value()) { match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)code_point.value() }); return true; } @@ -1566,7 +1541,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp for (auto ch : identity_escape_characters(flags.unicode, m_should_use_browser_extended_grammar)) { if (try_skip({ &ch, 1 })) { match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)ch }); return true; } } @@ -1574,11 +1549,47 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp if (flags.unicode) { if (try_skip("/"sv)) { match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'/' } }); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'/' }); return true; } } + return false; +} + +bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, ParseFlags flags) +{ + if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow); !escape_str.is_empty()) { + if (auto escape = escape_str.to_uint(); escape.has_value()) { + // See if this is a "back"-reference (we've already parsed the group it refers to) + auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value()); + if 
(maybe_length.has_value()) { + match_length_minimum += maybe_length.value(); + stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } }); + return true; + } + // It's not a pattern seen before, so we have to see if it's a valid reference to a future group. + if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) { + // This refers to a future group, and it will _always_ be matching an empty string + // So just match nothing and move on. + return true; + } + if (!m_should_use_browser_extended_grammar) { + set_error(Error::InvalidNumber); + return false; + } + } + + // If not, put the characters back. + back(escape_str.length()); + } + + Vector<CompareTypeAndValuePair> escape_compares; + if (parse_character_escape(escape_compares, match_length_minimum, flags)) { + stack.insert_bytecode_compare_values(move(escape_compares)); + return true; + } + if (flags.named && try_skip("k"sv)) { auto name = read_capture_group_specifier(true); if (name.is_empty()) { diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index 7f4e8104a2..3dbe5438d3 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -255,6 +255,8 @@ private: bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, ParseFlags); bool parse_unicode_property_escape(PropertyEscape& property, bool& negated); + bool parse_character_escape(Vector<CompareTypeAndValuePair>&, size_t&, ParseFlags); + // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers) bool parse_quantifiable_assertion(ByteCode&, size_t&, ParseFlags); bool parse_extended_atom(ByteCode&, size_t&, ParseFlags); |