diff options
author | AnotherTest <ali.mpfard@gmail.com> | 2020-11-28 12:27:59 +0330 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2020-11-28 10:13:33 +0100 |
commit | e2fa1b40c4ce16a566da76373b37934475b44904 (patch) | |
tree | f244be66f7cf5d50ea5384ea002c19fbe6a69542 /Libraries | |
parent | 801750b95a6ba5282db6e9f7917e25535c067127 (diff) | |
download | serenity-e2fa1b40c4ce16a566da76373b37934475b44904.zip |
LibRegex: Allow unknown escapes in non-unicode mode (for ECMA262)
This makes regexps like `/\x/` work as normal.
Partially deals with #4189.
Diffstat (limited to 'Libraries')
-rw-r--r-- | Libraries/LibRegex/RegexLexer.cpp | 7 | ||||
-rw-r--r-- | Libraries/LibRegex/RegexLexer.h | 1 | ||||
-rw-r--r-- | Libraries/LibRegex/RegexParser.cpp | 59 | ||||
-rw-r--r-- | Libraries/LibRegex/RegexParser.h | 1 |
4 files changed, 64 insertions, 4 deletions
diff --git a/Libraries/LibRegex/RegexLexer.cpp b/Libraries/LibRegex/RegexLexer.cpp index d7ad83e11c..a9439ff656 100644 --- a/Libraries/LibRegex/RegexLexer.cpp +++ b/Libraries/LibRegex/RegexLexer.cpp @@ -99,6 +99,13 @@ bool Lexer::try_skip(char c) return true; } +char Lexer::skip() +{ + auto c = peek(); + consume(); + return c; +} + Token Lexer::next() { size_t token_start_position; diff --git a/Libraries/LibRegex/RegexLexer.h b/Libraries/LibRegex/RegexLexer.h index 1d930f3925..7a54dea091 100644 --- a/Libraries/LibRegex/RegexLexer.h +++ b/Libraries/LibRegex/RegexLexer.h @@ -92,6 +92,7 @@ public: void back(size_t offset); void set_source(const StringView source) { m_source = source; } bool try_skip(char); + char skip(); StringView slice_back(size_t offset) const { return m_source.substring_view(m_position - offset - 1, offset); } diff --git a/Libraries/LibRegex/RegexParser.cpp b/Libraries/LibRegex/RegexParser.cpp index 1fb3d569cd..d4f906e206 100644 --- a/Libraries/LibRegex/RegexParser.cpp +++ b/Libraries/LibRegex/RegexParser.cpp @@ -108,6 +108,20 @@ ALWAYS_INLINE bool Parser::try_skip(StringView str) return true; } +ALWAYS_INLINE char Parser::skip() +{ + char ch; + if (m_parser_state.current_token.value().length() == 1) { + ch = m_parser_state.current_token.value()[0]; + } else { + m_parser_state.lexer.back(m_parser_state.current_token.value().length()); + ch = m_parser_state.lexer.skip(); + } + + m_parser_state.current_token = m_parser_state.lexer.next(); + return ch; +} + ALWAYS_INLINE void Parser::reset() { m_parser_state.bytecode.clear(); @@ -1017,6 +1031,16 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini return true; } } + + if (unicode) { + set_error(Error::InvalidPattern); + return false; + } + + // Allow '\c' in non-unicode mode, just matches 'c'. 
+ match_length_minimum += 1; + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } }); + return true; } // '\0' @@ -1032,6 +1056,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini match_length_minimum += 1; stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } }); return true; + } else if (!unicode) { + // '\x' is allowed in non-unicode mode, just matches 'x'. + match_length_minimum += 1; + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } }); + return true; + } else { + set_error(Error::InvalidPattern); + return false; } } @@ -1088,6 +1120,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini bool negate = false; auto ch = parse_character_class_escape(negate); if (!ch.has_value()) { + if (!unicode) { + // Allow all SourceCharacter's as escapes here. + auto token = consume(); + match_length_minimum += 1; + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } }); + return true; + } + set_error(Error::InvalidCharacterClass); return false; } @@ -1203,8 +1243,15 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& // HexEscape if (try_skip("x")) { - if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value()) + if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value()) { return { { .code_point = hex_escape.value(), .is_character_class = false } }; + } else if (!unicode) { + // '\x' is allowed in non-unicode mode, just matches 'x'. 
+ return { { .code_point = 'x', .is_character_class = false } }; + } else { + set_error(Error::InvalidPattern); + return {}; + } } if (try_skip("u")) { @@ -1234,14 +1281,18 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& return { { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } }; if (try_skip("W")) return { { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } }; + + if (!unicode) { + // Any unrecognised escape is allowed in non-unicode mode. + return { { .code_point = (u32)skip(), .is_character_class = false } }; + } } if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus)) return {}; - auto token = consume(TokenType::Char, Error::InvalidCharacterClass); - - return { { .code_point = (u32)token.value()[0], .is_character_class = false } }; + // Allow any (other) SourceCharacter. + return { { .code_point = (u32)skip(), .is_character_class = false } }; }; auto read_class_atom = [&]() -> Optional<CharClassRangeElement> { if (match(TokenType::HyphenMinus)) { diff --git a/Libraries/LibRegex/RegexParser.h b/Libraries/LibRegex/RegexParser.h index b9d89262e7..7227e9fe84 100644 --- a/Libraries/LibRegex/RegexParser.h +++ b/Libraries/LibRegex/RegexParser.h @@ -94,6 +94,7 @@ protected: ALWAYS_INLINE Token consume(TokenType type, Error error); ALWAYS_INLINE bool consume(const String&); ALWAYS_INLINE bool try_skip(StringView); + ALWAYS_INLINE char skip(); ALWAYS_INLINE void reset(); ALWAYS_INLINE bool done() const; ALWAYS_INLINE bool set_error(Error error); |