LibRegex: Allow unknown escapes in non-unicode mode (for ECMA262)

This makes regexps like `/\x/` work as normal. Partially deals with #4189.
author: AnotherTest <ali.mpfard@gmail.com> 2020-11-28 12:27:59 +0330
committer: Andreas Kling <kling@serenityos.org> 2020-11-28 10:13:33 +0100
commit: e2fa1b40c4ce16a566da76373b37934475b44904 (patch)
tree: f244be66f7cf5d50ea5384ea002c19fbe6a69542 /Libraries
parent: 801750b95a6ba5282db6e9f7917e25535c067127 (diff)
download: serenity-e2fa1b40c4ce16a566da76373b37934475b44904.zip
4 files changed, 64 insertions, 4 deletions
diff --git a/Libraries/LibRegex/RegexLexer.cpp b/Libraries/LibRegex/RegexLexer.cpp
index d7ad83e11c..a9439ff656 100644
--- a/Libraries/LibRegex/RegexLexer.cpp
+++ b/Libraries/LibRegex/RegexLexer.cpp
@@ -99,6 +99,13 @@ bool Lexer::try_skip(char c)
     return true;
 }
 
+char Lexer::skip() // Returns the value obtained from peek(), calling consume() before returning it.
+{
+    auto c = peek(); // capture the value first, since consume() is called next (presumably advances the position - confirm)
+    consume();
+    return c;
+}
+
 Token Lexer::next()
 {
     size_t token_start_position;
diff --git a/Libraries/LibRegex/RegexLexer.h b/Libraries/LibRegex/RegexLexer.h
index 1d930f3925..7a54dea091 100644
--- a/Libraries/LibRegex/RegexLexer.h
+++ b/Libraries/LibRegex/RegexLexer.h
@@ -92,6 +92,7 @@ public:
     void back(size_t offset);
     void set_source(const StringView source) { m_source = source; }
     bool try_skip(char);
+    char skip();
 
     StringView slice_back(size_t offset) const { return m_source.substring_view(m_position - offset - 1, offset); }
 
diff --git a/Libraries/LibRegex/RegexParser.cpp b/Libraries/LibRegex/RegexParser.cpp
index 1fb3d569cd..d4f906e206 100644
--- a/Libraries/LibRegex/RegexParser.cpp
+++ b/Libraries/LibRegex/RegexParser.cpp
@@ -108,6 +108,20 @@ ALWAYS_INLINE bool Parser::try_skip(StringView str)
     return true;
 }
 
+ALWAYS_INLINE char Parser::skip() // Yields one char taken from the current token (or re-read from the lexer), then fetches the next token.
+{
+    char ch;
+    if (m_parser_state.current_token.value().length() == 1) { // token value is exactly one char: that char is the result
+        ch = m_parser_state.current_token.value()[0];
+    } else { // NOTE(review): this branch also runs when the token value has length 0 - confirm that is intended
+        m_parser_state.lexer.back(m_parser_state.current_token.value().length()); // presumably rewinds the lexer to the start of the token - confirm back() semantics
+        ch = m_parser_state.lexer.skip(); // take a single char from the lexer rather than the whole token value
+    }
+
+    m_parser_state.current_token = m_parser_state.lexer.next(); // replace the current token with the lexer's next one
+    return ch;
+}
+
 ALWAYS_INLINE void Parser::reset()
 {
     m_parser_state.bytecode.clear();
@@ -1017,6 +1031,16 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
                 return true;
             }
         }
+
+        if (unicode) {
+            set_error(Error::InvalidPattern);
+            return false;
+        }
+
+        // Allow '\c' in non-unicode mode, just matches 'c'.
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } });
+        return true;
     }
 
     // '\0'
@@ -1032,6 +1056,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
             match_length_minimum += 1;
             stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
             return true;
+        } else if (!unicode) {
+            // '\x' is allowed in non-unicode mode, just matches 'x'.
+            match_length_minimum += 1;
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } });
+            return true;
+        } else {
+            set_error(Error::InvalidPattern);
+            return false;
         }
     }
 
@@ -1088,6 +1120,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
     bool negate = false;
     auto ch = parse_character_class_escape(negate);
     if (!ch.has_value()) {
+        if (!unicode) {
+            // Allow all SourceCharacter's as escapes here.
+            auto token = consume();
+            match_length_minimum += 1;
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
+            return true;
+        }
+
         set_error(Error::InvalidCharacterClass);
         return false;
     }
@@ -1203,8 +1243,15 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
 
             // HexEscape
             if (try_skip("x")) {
-                if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value())
+                if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value()) {
                     return { { .code_point = hex_escape.value(), .is_character_class = false } };
+                } else if (!unicode) {
+                    // '\x' is allowed in non-unicode mode, just matches 'x'.
+                    return { { .code_point = 'x', .is_character_class = false } };
+                } else {
+                    set_error(Error::InvalidPattern);
+                    return {};
+                }
             }
 
             if (try_skip("u")) {
@@ -1234,14 +1281,18 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                 return { { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
             if (try_skip("W"))
                 return { { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
+
+            if (!unicode) {
+                // Any unrecognised escape is allowed in non-unicode mode.
+                return { { .code_point = (u32)skip(), .is_character_class = false } };
+            }
         }
 
         if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
             return {};
 
-        auto token = consume(TokenType::Char, Error::InvalidCharacterClass);
-
-        return { { .code_point = (u32)token.value()[0], .is_character_class = false } };
+        // Allow any (other) SourceCharacter.
+        return { { .code_point = (u32)skip(), .is_character_class = false } };
     };
     auto read_class_atom = [&]() -> Optional<CharClassRangeElement> {
         if (match(TokenType::HyphenMinus)) {
diff --git a/Libraries/LibRegex/RegexParser.h b/Libraries/LibRegex/RegexParser.h
index b9d89262e7..7227e9fe84 100644
--- a/Libraries/LibRegex/RegexParser.h
+++ b/Libraries/LibRegex/RegexParser.h
@@ -94,6 +94,7 @@ protected:
     ALWAYS_INLINE Token consume(TokenType type, Error error);
     ALWAYS_INLINE bool consume(const String&);
     ALWAYS_INLINE bool try_skip(StringView);
+    ALWAYS_INLINE char skip();
     ALWAYS_INLINE void reset();
     ALWAYS_INLINE bool done() const;
     ALWAYS_INLINE bool set_error(Error error);
author	AnotherTest <ali.mpfard@gmail.com>	2020-11-28 12:27:59 +0330
committer	Andreas Kling <kling@serenityos.org>	2020-11-28 10:13:33 +0100
commit	e2fa1b40c4ce16a566da76373b37934475b44904 (patch)
tree	f244be66f7cf5d50ea5384ea002c19fbe6a69542 /Libraries
parent	801750b95a6ba5282db6e9f7917e25535c067127 (diff)
download	serenity-e2fa1b40c4ce16a566da76373b37934475b44904.zip