diff options
author | davidot <david.tuin@gmail.com> | 2021-08-21 11:27:20 +0200 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2021-08-24 07:42:37 +0100 |
commit | 7bcffd1b6ad82bf8986a9e99ccbd3cd9900482a7 (patch) | |
tree | 6b79d356efa5e870b5d72052cb431b8f3dce6b4b | |
parent | b012170d69c5d6a846034591b3a064d12346f8e1 (diff) | |
download | serenity-7bcffd1b6ad82bf8986a9e99ccbd3cd9900482a7.zip |
LibJS: Fix some small remaining issues with parsing unicode escapes
Added a test to ensure the behavior stays the same.
We now throw on direct usage of an escaped keyword with a specific
error to make it clearer to the user.
-rw-r--r-- | Userland/Libraries/LibJS/Lexer.cpp | 27 | ||||
-rw-r--r-- | Userland/Libraries/LibJS/Lexer.h | 2 | ||||
-rw-r--r-- | Userland/Libraries/LibJS/Parser.cpp | 34 | ||||
-rw-r--r-- | Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js | 69 | ||||
-rw-r--r-- | Userland/Libraries/LibJS/Token.cpp | 1 | ||||
-rw-r--r-- | Userland/Libraries/LibJS/Token.h | 1 |
6 files changed, 125 insertions, 9 deletions
diff --git a/Userland/Libraries/LibJS/Lexer.cpp b/Userland/Libraries/LibJS/Lexer.cpp index 11071b98fe..450755f21f 100644 --- a/Userland/Libraries/LibJS/Lexer.cpp +++ b/Userland/Libraries/LibJS/Lexer.cpp @@ -372,11 +372,14 @@ bool Lexer::is_whitespace() const return false; } -Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const +// UnicodeEscapeSequence :: https://tc39.es/ecma262/#prod-UnicodeEscapeSequence +// u Hex4Digits +// u{ CodePoint } +Optional<u32> Lexer::is_identifier_unicode_escape(size_t& identifier_length) const { GenericLexer lexer(source().substring_view(m_position - 1)); - if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) { + if (auto code_point_or_error = lexer.consume_escaped_code_point(false); !code_point_or_error.is_error()) { identifier_length = lexer.tell(); return code_point_or_error.value(); } @@ -384,13 +387,18 @@ Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const return {}; } +// IdentifierStart :: https://tc39.es/ecma262/#prod-IdentifierStart +// UnicodeIDStart +// $ +// _ +// \ UnicodeEscapeSequence Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const { u32 code_point = current_code_point(); identifier_length = 1; if (code_point == '\\') { - if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value()) + if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value()) code_point = *maybe_code_point; else return {}; @@ -406,13 +414,19 @@ Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const return {}; } +// IdentifierPart :: https://tc39.es/ecma262/#prod-IdentifierPart +// UnicodeIDContinue +// $ +// \ UnicodeEscapeSequence +// <ZWNJ> +// <ZWJ> Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const { u32 code_point = current_code_point(); identifier_length = 1; if (code_point == '\\') { - if (auto maybe_code_point = 
is_unicode_escape(identifier_length); maybe_code_point.has_value()) + if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value()) code_point = *maybe_code_point; else return {}; @@ -574,6 +588,7 @@ Token Lexer::next() token_type = TokenType::TemplateLiteralString; } } else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) { + bool has_escaped_character = false; // identifier or keyword StringBuilder builder; do { @@ -581,6 +596,8 @@ Token Lexer::next() for (size_t i = 0; i < identifier_length; ++i) consume(); + has_escaped_character |= identifier_length > 1; + code_point = is_identifier_middle(identifier_length); } while (code_point.has_value()); @@ -592,7 +609,7 @@ Token Lexer::next() if (it == s_keywords.end()) token_type = TokenType::Identifier; else - token_type = it->value; + token_type = has_escaped_character ? TokenType::EscapedKeyword : it->value; } else if (is_numeric_literal_start()) { token_type = TokenType::NumericLiteral; bool is_invalid_numeric_literal = false; diff --git a/Userland/Libraries/LibJS/Lexer.h b/Userland/Libraries/LibJS/Lexer.h index ea4da7e14a..03991f596d 100644 --- a/Userland/Libraries/LibJS/Lexer.h +++ b/Userland/Libraries/LibJS/Lexer.h @@ -41,7 +41,7 @@ private: bool is_eof() const; bool is_line_terminator() const; bool is_whitespace() const; - Optional<u32> is_unicode_escape(size_t& identifier_length) const; + Optional<u32> is_identifier_unicode_escape(size_t& identifier_length) const; Optional<u32> is_identifier_start(size_t& identifier_length) const; Optional<u32> is_identifier_middle(size_t& identifier_length) const; bool is_line_comment_start(bool line_has_token_yet) const; diff --git a/Userland/Libraries/LibJS/Parser.cpp b/Userland/Libraries/LibJS/Parser.cpp index ec04f51657..68b5e53e09 100644 --- a/Userland/Libraries/LibJS/Parser.cpp +++ b/Userland/Libraries/LibJS/Parser.cpp @@ -404,6 +404,11 @@ NonnullRefPtr<Statement> 
Parser::parse_statement(AllowLabelledFunction allow_lab m_state.current_token = m_state.lexer.force_slash_as_regex(); [[fallthrough]]; default: + if (m_state.current_token.type() == TokenType::EscapedKeyword + && (m_state.strict_mode + || (m_state.current_token.value() != "yield"sv && m_state.current_token.value() != "let"sv))) + syntax_error("Keyword must not contain escaped characters"); + if (match_identifier_name()) { auto result = try_parse_labelled_statement(allow_labelled_function); if (!result.is_null()) @@ -545,7 +550,7 @@ RefPtr<Statement> Parser::try_parse_labelled_statement(AllowLabelledFunction all load_state(); }; - if (match(TokenType::Yield) && (m_state.strict_mode || m_state.in_generator_function_context)) { + if (m_state.current_token.value() == "yield"sv && (m_state.strict_mode || m_state.in_generator_function_context)) { syntax_error("'yield' label not allowed in this context"); return {}; } @@ -604,7 +609,8 @@ RefPtr<MetaProperty> Parser::try_parse_new_target_expression() consume(); if (!match(TokenType::Identifier)) return {}; - if (consume().value() != "target") + // The string 'target' cannot have escapes so we check original value. 
+ if (consume().original_value() != "target"sv) return {}; state_rollback_guard.disarm(); @@ -847,6 +853,9 @@ Parser::PrimaryExpressionParseResult Parser::parse_primary_expression() if (!m_state.allow_super_property_lookup) syntax_error("'super' keyword unexpected here"); return { create_ast_node<SuperExpression>({ m_state.current_token.filename(), rule_start.position(), position() }) }; + case TokenType::EscapedKeyword: + syntax_error("Keyword must not contain escaped characters"); + [[fallthrough]]; case TokenType::Identifier: { read_as_identifier:; if (!try_parse_arrow_function_expression_failed_at_position(position())) { @@ -2800,6 +2809,14 @@ bool Parser::match_variable_declaration() bool Parser::match_identifier() const { + if (m_state.current_token.type() == TokenType::EscapedKeyword) { + if (m_state.current_token.value() == "let"sv) + return !m_state.strict_mode; + if (m_state.current_token.value() == "yield"sv) + return !m_state.strict_mode && !m_state.in_generator_function_context; + return true; + } + return m_state.current_token.type() == TokenType::Identifier || (m_state.current_token.type() == TokenType::Let && !m_state.strict_mode) || (m_state.current_token.type() == TokenType::Yield && !m_state.in_generator_function_context && !m_state.strict_mode); // See note in Parser::parse_identifier(). @@ -2859,6 +2876,9 @@ Token Parser::consume_identifier() if (match(TokenType::Identifier)) return consume(TokenType::Identifier); + if (match(TokenType::EscapedKeyword)) + return consume(TokenType::EscapedKeyword); + // Note that 'let' is not a reserved keyword, but our lexer considers it such // As it's pretty nice to have that (for syntax highlighting and such), we'll // special-case it here instead. 
@@ -2884,6 +2904,16 @@ Token Parser::consume_identifier_reference() if (match(TokenType::Identifier)) return consume(TokenType::Identifier); + if (match(TokenType::EscapedKeyword)) { + auto name = m_state.current_token.value(); + if (name == "await"sv) + syntax_error("Identifier reference may not be 'await'"); + else if (m_state.strict_mode && (name == "let"sv || name == "yield"sv)) + syntax_error(String::formatted("'{}' is not allowed as an identifier in strict mode", name)); + + return consume(); + } + // See note in Parser::parse_identifier(). if (match(TokenType::Let)) { if (m_state.strict_mode) diff --git a/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js b/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js index 4d089f21d9..a9a9a348a2 100644 --- a/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js +++ b/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js @@ -13,7 +13,74 @@ test("non-ascii escapes", () => { foo.ππ»πΈππ· = 12389; expect(foo.ππ»πΈππ·).toBe(12389); - expect(foo.ππ»\ud835\udcf8ππ·).toBe(12389); expect(foo.ππ»\u{1d4f8}ππ·).toBe(12389); expect(foo.\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}).toBe(12389); + + // U-16 High surrogate pair is allowed in string but not in identifier. + expect("foo.ππ»\ud835\udcf8ππ·").toEval(); + expect("foo.ππ»\\ud835\\udcf8ππ·").not.toEval(); +}); + +describe("escaped keywords", () => { + // We must double escape the slashes here else the strings already convert + // the escaped characters (and string is more lenient). 
+ test("keywords cannot be used in an escaped form", () => { + expect("\\u{69}\\u{66}(true) throw 'Should fail'").not.toEval(); + expect("wh\\u{69}le(true) throw 'Should fail'").not.toEval(); + + expect("l\\u{65}t a = 3;").not.toEval(); + expect("function *G(){ yiel\\0064 3; }").not.toEval(); + }); + + test("escaped keywords cannot be used as standalone variables", () => { + expect("var fu\\u{6e}ction = 4").not.toEval(); + expect("var \\u0077ith = 4").not.toEval(); + }); + + test("'yield' and 'let' can be escaped as variables", () => { + var l\u{65}t = 3; + var yi\u0065ld = 5; + expect(let).toBe(3); + expect(yield).toBe(5); + }); + + test("'let' cannot be used in a lexical declaration but 'yield' can", () => { + expect("const l\\u{65}t = 3;").not.toEval(); + + const yi\u0065ld = 5; + expect(yield).toBe(5); + }); + + test("escaped 'yield' and 'let' variables are not allowed in strict mode", () => { + expect("function f() { 'use strict'; var l\\u{65}t = 3; }").not.toEval(); + expect("function g() { 'use strict'; var yi\u0065ld = 5; }").not.toEval(); + }); + + test("cannot use escaped 'yield' variable or label in generator context", () => { + expect("function *g() { var yi\u0065ld = 5; }").not.toEval(); + expect("function *g() { yi\u0065ld: 5; }").not.toEval(); + }); + + test("can use escaped 'let' variable and label in generator context", () => { + expect("function *i() { var \\u{6c}et = 6; }").toEval(); + expect("function *j() { \\u{6c}et: 6; }").toEval(); + }); + + test("can use keywords in some contexts", () => { + var obj = { + \u{69}\u{66}: 3, + wh\u{69}le() { + return 4; + }, + ca\u0073e: "case", + get true() { + return false; + }, + }; + + expect(obj.\u{69}f).toBe(3); + expect(obj.whi\u{6c}e()).toBe(4); + expect(obj.\u{63}ase).toBe("case"); + expect(obj.\u0074r\u{0000075}e).toBeFalse(); + }); }); diff --git a/Userland/Libraries/LibJS/Token.cpp b/Userland/Libraries/LibJS/Token.cpp index bbce9d6e4f..b9c9ebb082 100644 --- a/Userland/Libraries/LibJS/Token.cpp +++ 
b/Userland/Libraries/LibJS/Token.cpp @@ -204,6 +204,7 @@ bool Token::is_identifier_name() const // The standard defines this reversed: Identifiers are IdentifierNames except reserved words // https://tc39.es/ecma262/#prod-Identifier return m_type == TokenType::Identifier + || m_type == TokenType::EscapedKeyword || m_type == TokenType::Await || m_type == TokenType::BoolLiteral || m_type == TokenType::Break diff --git a/Userland/Libraries/LibJS/Token.h b/Userland/Libraries/LibJS/Token.h index 12c58ae10c..da5161771c 100644 --- a/Userland/Libraries/LibJS/Token.h +++ b/Userland/Libraries/LibJS/Token.h @@ -74,6 +74,7 @@ constexpr const u32 ZERO_WIDTH_JOINER { 0x200D }; __ENUMERATE_JS_TOKEN(Equals, Operator) \ __ENUMERATE_JS_TOKEN(EqualsEquals, Operator) \ __ENUMERATE_JS_TOKEN(EqualsEqualsEquals, Operator) \ + __ENUMERATE_JS_TOKEN(EscapedKeyword, Identifier) \ __ENUMERATE_JS_TOKEN(ExclamationMark, Operator) \ __ENUMERATE_JS_TOKEN(ExclamationMarkEquals, Operator) \ __ENUMERATE_JS_TOKEN(ExclamationMarkEqualsEquals, Operator) \ |