summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ali Mohammad Pur <ali.mpfard@gmail.com> 2022-07-20 23:19:43 +0430
committer: Linus Groh <mail@linusgroh.de> 2022-07-20 21:25:59 +0100
commit: 77349149092dea81849eac1fee1799df8b3919a6 (patch)
tree: bbd1f4698d25575f3b914e6553b8cfe3c2e97036
parent: b908f9f6ef2fb4f7f97437383bd257e7646c1609 (diff)
download: serenity-77349149092dea81849eac1fee1799df8b3919a6.zip
LibRegex: Refactor parsing 'CharacterEscape' out of 'AtomEscape'
The ECMA262 spec has this as a separate production, and we need it to be split up for a future commit.
-rw-r--r-- Userland/Libraries/LibRegex/RegexParser.cpp 93
-rw-r--r-- Userland/Libraries/LibRegex/RegexParser.h 2
2 files changed, 54 insertions, 41 deletions
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index 442a662528..2503f38558 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -1428,59 +1428,34 @@ bool ECMA262Parser::parse_invalid_braced_quantifier()
bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& compares, size_t& match_length_minimum, ParseFlags flags)
{
- if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow); !escape_str.is_empty()) {
- if (auto escape = escape_str.to_uint(); escape.has_value()) {
- // See if this is a "back"-reference (we've already parsed the group it refers to)
- auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
- if (maybe_length.has_value()) {
- match_length_minimum += maybe_length.value();
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
- return true;
- }
- // It's not a pattern seen before, so we have to see if it's a valid reference to a future group.
- if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) {
- // This refers to a future group, and it will _always_ be matching an empty string
- // So just match nothing and move on.
- return true;
- }
- if (!m_should_use_browser_extended_grammar) {
- set_error(Error::InvalidNumber);
- return false;
- }
- }
-
- // If not, put the characters back.
- back(escape_str.length());
- }
-
// CharacterEscape > ControlEscape
if (try_skip("f"sv)) {
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\f' });
return true;
}
if (try_skip("n"sv)) {
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\n' });
return true;
}
if (try_skip("r"sv)) {
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\r' });
return true;
}
if (try_skip("t"sv)) {
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\t' });
return true;
}
if (try_skip("v"sv)) {
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\v' });
return true;
}
@@ -1489,7 +1464,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
for (auto c : s_alphabetic_characters) {
if (try_skip({ &c, 1 })) {
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)(c % 32) });
return true;
}
}
@@ -1500,15 +1475,15 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
}
if (m_should_use_browser_extended_grammar) {
- back(1 + !done());
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\\' } });
+ back(1 + (done() ? 0 : 1));
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\\' });
match_length_minimum += 1;
return true;
}
// Allow '\c' in non-unicode mode, just matches 'c'.
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'c' });
return true;
}
@@ -1516,7 +1491,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
if (try_skip("0"sv)) {
if (!lookahead_any(s_decimal_characters)) {
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)0 });
return true;
}
@@ -1527,7 +1502,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
if (m_should_use_browser_extended_grammar) {
if (!flags.unicode) {
if (auto escape = parse_legacy_octal_escape(); escape.has_value()) {
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)escape.value() } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)escape.value() });
match_length_minimum += 1;
return true;
}
@@ -1538,13 +1513,13 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
if (try_skip("x"sv)) {
if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) {
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() });
return true;
}
if (!flags.unicode) {
// '\x' is allowed in non-unicode mode, just matches 'x'.
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'x' });
return true;
}
@@ -1555,7 +1530,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
if (try_skip("u"sv)) {
if (auto code_point = consume_escaped_code_point(flags.unicode); code_point.has_value()) {
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)code_point.value() });
return true;
}
@@ -1566,7 +1541,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
for (auto ch : identity_escape_characters(flags.unicode, m_should_use_browser_extended_grammar)) {
if (try_skip({ &ch, 1 })) {
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)ch });
return true;
}
}
@@ -1574,11 +1549,47 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
if (flags.unicode) {
if (try_skip("/"sv)) {
match_length_minimum += 1;
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'/' } });
+ compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'/' });
return true;
}
}
+ return false;
+}
+
+bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, ParseFlags flags)
+{
+ if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow); !escape_str.is_empty()) {
+ if (auto escape = escape_str.to_uint(); escape.has_value()) {
+ // See if this is a "back"-reference (we've already parsed the group it refers to)
+ auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
+ if (maybe_length.has_value()) {
+ match_length_minimum += maybe_length.value();
+ stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
+ return true;
+ }
+ // It's not a pattern seen before, so we have to see if it's a valid reference to a future group.
+ if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) {
+ // This refers to a future group, and it will _always_ be matching an empty string
+ // So just match nothing and move on.
+ return true;
+ }
+ if (!m_should_use_browser_extended_grammar) {
+ set_error(Error::InvalidNumber);
+ return false;
+ }
+ }
+
+ // If not, put the characters back.
+ back(escape_str.length());
+ }
+
+ Vector<CompareTypeAndValuePair> escape_compares;
+ if (parse_character_escape(escape_compares, match_length_minimum, flags)) {
+ stack.insert_bytecode_compare_values(move(escape_compares));
+ return true;
+ }
+
if (flags.named && try_skip("k"sv)) {
auto name = read_capture_group_specifier(true);
if (name.is_empty()) {
diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h
index 7f4e8104a2..3dbe5438d3 100644
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@@ -255,6 +255,8 @@ private:
bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, ParseFlags);
bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
+ bool parse_character_escape(Vector<CompareTypeAndValuePair>&, size_t&, ParseFlags);
+
// Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
bool parse_quantifiable_assertion(ByteCode&, size_t&, ParseFlags);
bool parse_extended_atom(ByteCode&, size_t&, ParseFlags);