LibRegex: Allow references to capture groups that aren't parsed yet

This only applies to the ECMA262 parser. This behaviour is an ECMA262-specific quirk, such references always generate zero-length matches (even on subsequent passes). Also adds a test in LibJS's test suite. Fixes #6039.
author: AnotherTest <ali.mpfard@gmail.com> 2021-04-01 18:30:47 +0430
committer: Andreas Kling <kling@serenityos.org> 2021-04-01 21:55:47 +0200
commit: 6bbb26fdaf19e95c8082819d836f46cd7d1f7a86 (patch)
tree: a96c169ad02a9bfe8e916b6cb435c48745bce7c8
parent: 804ab79995eeaa595073d4395a77792d61f29ec5 (diff)
download: serenity-6bbb26fdaf19e95c8082819d836f46cd7d1f7a86.zip
6 files changed, 80 insertions, 6 deletions
diff --git a/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js b/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js
index 5747ae487e..3cdd5dd8e3 100644
--- a/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js
+++ b/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js
@@ -66,3 +66,15 @@ test("not matching", () => {
 
     expect(res).toBe(null);
 });
+
+// Backreference to a group not yet parsed: #6039
+test("Future group backreference, #6039", () => {
+    let re = /(\3)(\1)(a)/;
+    let result = re.exec("cat");
+    expect(result.length).toBe(4);
+    expect(result[0]).toBe("a");
+    expect(result[1]).toBe("");
+    expect(result[2]).toBe("");
+    expect(result[3]).toBe("a");
+    expect(result.index).toBe(1);
+});
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index d9b91a6ec4..00578fb211 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -399,6 +399,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
 
     size_t string_position = state.string_position;
     bool inverse_matched { false };
+    bool had_zero_length_match { false };
 
     size_t offset { state.instruction_position + 3 };
     for (size_t i = 0; i < arguments_count(); ++i) {
@@ -454,7 +455,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
             if (input.view.length() - state.string_position < length)
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
-            if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length))
+            if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length, had_zero_length_match))
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
         } else if (compare_type == CharacterCompareType::CharClass) {
@@ -488,7 +489,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
             if (input.view.length() - state.string_position < str.length())
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
-            if (!compare_string(input, state, str.characters_without_null_termination(), str.length()))
+            if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
         } else if (compare_type == CharacterCompareType::NamedReference) {
@@ -506,7 +507,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
             if (input.view.length() - state.string_position < str.length())
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
-            if (!compare_string(input, state, str.characters_without_null_termination(), str.length()))
+            if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
         } else {
@@ -519,7 +520,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
     if (current_inversion_state() && !inverse_matched)
         ++state.string_position;
 
-    if (string_position == state.string_position || state.string_position > input.view.length())
+    if ((!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length())
         return ExecutionResult::Failed_ExecuteLowPrioForks;
 
     return ExecutionResult::Continue;
@@ -542,7 +543,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchSt
     }
 }
 
-ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length)
+ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match)
 {
     if (input.view.is_u8_view()) {
         auto str_view1 = StringView(str, length);
@@ -558,6 +559,8 @@ ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, Match
 
         if (str_view1 == str_view2) {
             state.string_position += length;
+            if (length == 0)
+                had_zero_length_match = true;
             return true;
         }
     }
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h
index 1909e36a91..d8021adf20 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.h
+++ b/Userland/Libraries/LibRegex/RegexByteCode.h
@@ -768,7 +768,7 @@ public:
 
 private:
     ALWAYS_INLINE static void compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched);
-    ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length);
+    ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match);
     ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
 };
diff --git a/Userland/Libraries/LibRegex/RegexLexer.h b/Userland/Libraries/LibRegex/RegexLexer.h
index 959fa50c33..d6076cee8d 100644
--- a/Userland/Libraries/LibRegex/RegexLexer.h
+++ b/Userland/Libraries/LibRegex/RegexLexer.h
@@ -93,6 +93,7 @@ public:
     void set_source(const StringView source) { m_source = source; }
     bool try_skip(char);
     char skip();
+    const auto& source() const { return m_source; }
 
 private:
     ALWAYS_INLINE char peek(size_t offset = 0) const;
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index 3bf96475d2..fe00adfd97 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -1114,12 +1114,19 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
 {
     if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow, ReadDigitFollowPolicy::DisallowNonDigit); !escape_str.is_empty()) {
         if (auto escape = escape_str.to_uint(); escape.has_value()) {
+            // See if this is a "back"-reference (we've already parsed the group it refers to)
             auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
             if (maybe_length.has_value()) {
                 match_length_minimum += maybe_length.value();
                 stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
                 return true;
             }
+            // It's not a pattern seen before, so we have to see if it's a valid reference to a future group.
+            if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) {
+                // This refers to a future group, and it will _always_ be matching an empty string
+                // So just match nothing and move on.
+                return true;
+            }
             if (!m_should_use_browser_extended_grammar) {
                 set_error(Error::InvalidNumber);
                 return false;
@@ -1729,4 +1736,47 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
 
     return true;
 }
+
+size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis()
+{
+    if (m_total_number_of_capturing_parenthesis.has_value())
+        return m_total_number_of_capturing_parenthesis.value();
+
+    GenericLexer lexer { m_parser_state.lexer.source() };
+    size_t count = 0;
+    while (!lexer.is_eof()) {
+        switch (lexer.peek()) {
+        case '\\':
+            lexer.consume(2);
+            continue;
+        case '[':
+            while (!lexer.is_eof()) {
+                if (lexer.consume_specific('\\'))
+                    lexer.consume();
+                else if (lexer.consume_specific(']'))
+                    break;
+                lexer.consume();
+            }
+            break;
+        case '(':
+            if (lexer.consume_specific('?')) {
+                // non-capturing group '(?:', lookaround '(?<='/'(?<!', or named capture '(?<'
+                if (!lexer.consume_specific('<'))
+                    break;
+
+                if (lexer.next_is(is_any_of("=!")))
+                    break;
+
+                ++count;
+            } else {
+                ++count;
+            }
+            break;
+        }
+        lexer.consume();
+    }
+
+    m_total_number_of_capturing_parenthesis = count;
+    return count;
+}
 }
diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h
index c8f24b094c..dc4e682fd2 100644
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@@ -210,6 +210,14 @@ private:
     bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length);
     Optional<u8> parse_legacy_octal_escape();
 
+    size_t ensure_total_number_of_capturing_parenthesis();
+
+    // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
+    // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
+    // parse it twice, so we'll just do so when it's actually needed.
+    // Most patterns should have no need to ever populate this field.
+    Optional<size_t> m_total_number_of_capturing_parenthesis;
+
     // Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
     bool m_should_use_browser_extended_grammar { false };
 };
author	AnotherTest <ali.mpfard@gmail.com>	2021-04-01 18:30:47 +0430
committer	Andreas Kling <kling@serenityos.org>	2021-04-01 21:55:47 +0200
commit	6bbb26fdaf19e95c8082819d836f46cd7d1f7a86 (patch)
tree	a96c169ad02a9bfe8e916b6cb435c48745bce7c8
parent	804ab79995eeaa595073d4395a77792d61f29ec5 (diff)
download	serenity-6bbb26fdaf19e95c8082819d836f46cd7d1f7a86.zip