summaryrefslogtreecommitdiff
path: root/Libraries
diff options
context:
space:
mode:
authorAnotherTest <ali.mpfard@gmail.com>2020-11-28 12:27:59 +0330
committerAndreas Kling <kling@serenityos.org>2020-11-28 10:13:33 +0100
commite2fa1b40c4ce16a566da76373b37934475b44904 (patch)
treef244be66f7cf5d50ea5384ea002c19fbe6a69542 /Libraries
parent801750b95a6ba5282db6e9f7917e25535c067127 (diff)
downloadserenity-e2fa1b40c4ce16a566da76373b37934475b44904.zip
LibRegex: Allow unknown escapes in non-unicode mode (for ECMA262)
This makes regexps like `/\x/` work as normal. Partially deals with #4189.
Diffstat (limited to 'Libraries')
-rw-r--r--Libraries/LibRegex/RegexLexer.cpp7
-rw-r--r--Libraries/LibRegex/RegexLexer.h1
-rw-r--r--Libraries/LibRegex/RegexParser.cpp59
-rw-r--r--Libraries/LibRegex/RegexParser.h1
4 files changed, 64 insertions, 4 deletions
diff --git a/Libraries/LibRegex/RegexLexer.cpp b/Libraries/LibRegex/RegexLexer.cpp
index d7ad83e11c..a9439ff656 100644
--- a/Libraries/LibRegex/RegexLexer.cpp
+++ b/Libraries/LibRegex/RegexLexer.cpp
@@ -99,6 +99,13 @@ bool Lexer::try_skip(char c)
return true;
}
+char Lexer::skip() // Returns the value peek() reports, after calling consume() once.
+{
+ auto c = peek(); // Capture the value before consume() runs.
+ consume();
+ return c; // NOTE(review): `c` has peek()'s return type (not visible here) and is converted to char on return - confirm no end-of-input sentinel gets truncated.
+}
+
Token Lexer::next()
{
size_t token_start_position;
diff --git a/Libraries/LibRegex/RegexLexer.h b/Libraries/LibRegex/RegexLexer.h
index 1d930f3925..7a54dea091 100644
--- a/Libraries/LibRegex/RegexLexer.h
+++ b/Libraries/LibRegex/RegexLexer.h
@@ -92,6 +92,7 @@ public:
void back(size_t offset);
void set_source(const StringView source) { m_source = source; }
bool try_skip(char);
+ char skip();
StringView slice_back(size_t offset) const { return m_source.substring_view(m_position - offset - 1, offset); }
diff --git a/Libraries/LibRegex/RegexParser.cpp b/Libraries/LibRegex/RegexParser.cpp
index 1fb3d569cd..d4f906e206 100644
--- a/Libraries/LibRegex/RegexParser.cpp
+++ b/Libraries/LibRegex/RegexParser.cpp
@@ -108,6 +108,20 @@ ALWAYS_INLINE bool Parser::try_skip(StringView str)
return true;
}
+ALWAYS_INLINE char Parser::skip() // Yields one character taken from the current token, then replaces the current token with lexer.next().
+{
+ char ch;
+ if (m_parser_state.current_token.value().length() == 1) { // Single-character token: its only character is the result.
+ ch = m_parser_state.current_token.value()[0];
+ } else { // NOTE(review): any length other than 1 lands here, including a zero-length token value - confirm that case is intended.
+ m_parser_state.lexer.back(m_parser_state.current_token.value().length()); // Rewind the lexer by the token's length (presumably back to the token's start - confirm against Lexer::back).
+ ch = m_parser_state.lexer.skip(); // Take a single character from the lexer; the remainder is presumably re-tokenized by next() below.
+ }
+
+ m_parser_state.current_token = m_parser_state.lexer.next();
+ return ch;
+}
+
ALWAYS_INLINE void Parser::reset()
{
m_parser_state.bytecode.clear();
@@ -1017,6 +1031,16 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
return true;
}
}
+
+ if (unicode) {
+ set_error(Error::InvalidPattern);
+ return false;
+ }
+
+ // Allow '\c' in non-unicode mode, just matches 'c'.
+ match_length_minimum += 1;
+ stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } });
+ return true;
}
// '\0'
@@ -1032,6 +1056,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
return true;
+ } else if (!unicode) {
+ // '\x' is allowed in non-unicode mode, just matches 'x'.
+ match_length_minimum += 1;
+ stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } });
+ return true;
+ } else {
+ set_error(Error::InvalidPattern);
+ return false;
}
}
@@ -1088,6 +1120,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
bool negate = false;
auto ch = parse_character_class_escape(negate);
if (!ch.has_value()) {
+ if (!unicode) {
+ // Allow all SourceCharacter's as escapes here.
+ auto token = consume();
+ match_length_minimum += 1;
+ stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
+ return true;
+ }
+
set_error(Error::InvalidCharacterClass);
return false;
}
@@ -1203,8 +1243,15 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
// HexEscape
if (try_skip("x")) {
- if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value())
+ if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value()) {
return { { .code_point = hex_escape.value(), .is_character_class = false } };
+ } else if (!unicode) {
+ // '\x' is allowed in non-unicode mode, just matches 'x'.
+ return { { .code_point = 'x', .is_character_class = false } };
+ } else {
+ set_error(Error::InvalidPattern);
+ return {};
+ }
}
if (try_skip("u")) {
@@ -1234,14 +1281,18 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
return { { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
if (try_skip("W"))
return { { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
+
+ if (!unicode) {
+ // Any unrecognised escape is allowed in non-unicode mode.
+ return { { .code_point = (u32)skip(), .is_character_class = false } };
+ }
}
if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
return {};
- auto token = consume(TokenType::Char, Error::InvalidCharacterClass);
-
- return { { .code_point = (u32)token.value()[0], .is_character_class = false } };
+ // Allow any (other) SourceCharacter.
+ return { { .code_point = (u32)skip(), .is_character_class = false } };
};
auto read_class_atom = [&]() -> Optional<CharClassRangeElement> {
if (match(TokenType::HyphenMinus)) {
diff --git a/Libraries/LibRegex/RegexParser.h b/Libraries/LibRegex/RegexParser.h
index b9d89262e7..7227e9fe84 100644
--- a/Libraries/LibRegex/RegexParser.h
+++ b/Libraries/LibRegex/RegexParser.h
@@ -94,6 +94,7 @@ protected:
ALWAYS_INLINE Token consume(TokenType type, Error error);
ALWAYS_INLINE bool consume(const String&);
ALWAYS_INLINE bool try_skip(StringView);
+ ALWAYS_INLINE char skip();
ALWAYS_INLINE void reset();
ALWAYS_INLINE bool done() const;
ALWAYS_INLINE bool set_error(Error error);