LibJS: Fix some small remaining issues with parsing unicode escapes

Added a test to ensure the behavior stays the same. We now throw a specific error on direct usage of an escaped keyword, to make the problem clearer to the user.
author: davidot <david.tuin@gmail.com> 2021-08-21 11:27:20 +0200
committer: Linus Groh <mail@linusgroh.de> 2021-08-24 07:42:37 +0100
commit: 7bcffd1b6ad82bf8986a9e99ccbd3cd9900482a7 (patch)
tree: 6b79d356efa5e870b5d72052cb431b8f3dce6b4b
parent: b012170d69c5d6a846034591b3a064d12346f8e1 (diff)
download: serenity-7bcffd1b6ad82bf8986a9e99ccbd3cd9900482a7.zip
6 files changed, 125 insertions, 9 deletions
diff --git a/Userland/Libraries/LibJS/Lexer.cpp b/Userland/Libraries/LibJS/Lexer.cpp
index 11071b98fe..450755f21f 100644
--- a/Userland/Libraries/LibJS/Lexer.cpp
+++ b/Userland/Libraries/LibJS/Lexer.cpp
@@ -372,11 +372,14 @@ bool Lexer::is_whitespace() const
     return false;
 }
 
-Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
+// UnicodeEscapeSequence :: https://tc39.es/ecma262/#prod-UnicodeEscapeSequence
+//          u Hex4Digits
+//          u{ CodePoint }
+Optional<u32> Lexer::is_identifier_unicode_escape(size_t& identifier_length) const
 {
     GenericLexer lexer(source().substring_view(m_position - 1));
 
-    if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) {
+    if (auto code_point_or_error = lexer.consume_escaped_code_point(false); !code_point_or_error.is_error()) {
         identifier_length = lexer.tell();
         return code_point_or_error.value();
     }
@@ -384,13 +387,18 @@ Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
     return {};
 }
 
+// IdentifierStart :: https://tc39.es/ecma262/#prod-IdentifierStart
+//          UnicodeIDStart
+//          $
+//          _
+//          \ UnicodeEscapeSequence
 Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
 {
     u32 code_point = current_code_point();
     identifier_length = 1;
 
     if (code_point == '\\') {
-        if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
+        if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value())
             code_point = *maybe_code_point;
         else
             return {};
@@ -406,13 +414,19 @@ Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
     return {};
 }
 
+// IdentifierPart :: https://tc39.es/ecma262/#prod-IdentifierPart
+//          UnicodeIDContinue
+//          $
+//          \ UnicodeEscapeSequence
+//          <ZWNJ>
+//          <ZWJ>
 Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
 {
     u32 code_point = current_code_point();
     identifier_length = 1;
 
     if (code_point == '\\') {
-        if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
+        if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value())
             code_point = *maybe_code_point;
         else
             return {};
@@ -574,6 +588,7 @@ Token Lexer::next()
                 token_type = TokenType::TemplateLiteralString;
         }
     } else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
+        bool has_escaped_character = false;
         // identifier or keyword
         StringBuilder builder;
         do {
@@ -581,6 +596,8 @@ Token Lexer::next()
             for (size_t i = 0; i < identifier_length; ++i)
                 consume();
 
+            has_escaped_character |= identifier_length > 1;
+
             code_point = is_identifier_middle(identifier_length);
         } while (code_point.has_value());
 
@@ -592,7 +609,7 @@ Token Lexer::next()
         if (it == s_keywords.end())
             token_type = TokenType::Identifier;
         else
-            token_type = it->value;
+            token_type = has_escaped_character ? TokenType::EscapedKeyword : it->value;
     } else if (is_numeric_literal_start()) {
         token_type = TokenType::NumericLiteral;
         bool is_invalid_numeric_literal = false;
diff --git a/Userland/Libraries/LibJS/Lexer.h b/Userland/Libraries/LibJS/Lexer.h
index ea4da7e14a..03991f596d 100644
--- a/Userland/Libraries/LibJS/Lexer.h
+++ b/Userland/Libraries/LibJS/Lexer.h
@@ -41,7 +41,7 @@ private:
     bool is_eof() const;
     bool is_line_terminator() const;
     bool is_whitespace() const;
-    Optional<u32> is_unicode_escape(size_t& identifier_length) const;
+    Optional<u32> is_identifier_unicode_escape(size_t& identifier_length) const;
     Optional<u32> is_identifier_start(size_t& identifier_length) const;
     Optional<u32> is_identifier_middle(size_t& identifier_length) const;
     bool is_line_comment_start(bool line_has_token_yet) const;
diff --git a/Userland/Libraries/LibJS/Parser.cpp b/Userland/Libraries/LibJS/Parser.cpp
index ec04f51657..68b5e53e09 100644
--- a/Userland/Libraries/LibJS/Parser.cpp
+++ b/Userland/Libraries/LibJS/Parser.cpp
@@ -404,6 +404,11 @@ NonnullRefPtr<Statement> Parser::parse_statement(AllowLabelledFunction allow_lab
         m_state.current_token = m_state.lexer.force_slash_as_regex();
         [[fallthrough]];
     default:
+        if (m_state.current_token.type() == TokenType::EscapedKeyword
+            && (m_state.strict_mode
+                || (m_state.current_token.value() != "yield"sv && m_state.current_token.value() != "let"sv)))
+            syntax_error("Keyword must not contain escaped characters");
+
         if (match_identifier_name()) {
             auto result = try_parse_labelled_statement(allow_labelled_function);
             if (!result.is_null())
@@ -545,7 +550,7 @@ RefPtr<Statement> Parser::try_parse_labelled_statement(AllowLabelledFunction all
         load_state();
     };
 
-    if (match(TokenType::Yield) && (m_state.strict_mode || m_state.in_generator_function_context)) {
+    if (m_state.current_token.value() == "yield"sv && (m_state.strict_mode || m_state.in_generator_function_context)) {
         syntax_error("'yield' label not allowed in this context");
         return {};
     }
@@ -604,7 +609,8 @@ RefPtr<MetaProperty> Parser::try_parse_new_target_expression()
     consume();
     if (!match(TokenType::Identifier))
         return {};
-    if (consume().value() != "target")
+    // The string 'target' cannot have escapes so we check original value.
+    if (consume().original_value() != "target"sv)
         return {};
 
     state_rollback_guard.disarm();
@@ -847,6 +853,9 @@ Parser::PrimaryExpressionParseResult Parser::parse_primary_expression()
         if (!m_state.allow_super_property_lookup)
             syntax_error("'super' keyword unexpected here");
         return { create_ast_node<SuperExpression>({ m_state.current_token.filename(), rule_start.position(), position() }) };
+    case TokenType::EscapedKeyword:
+        syntax_error("Keyword must not contain escaped characters");
+        [[fallthrough]];
     case TokenType::Identifier: {
     read_as_identifier:;
         if (!try_parse_arrow_function_expression_failed_at_position(position())) {
@@ -2800,6 +2809,14 @@ bool Parser::match_variable_declaration()
 
 bool Parser::match_identifier() const
 {
+    if (m_state.current_token.type() == TokenType::EscapedKeyword) {
+        if (m_state.current_token.value() == "let"sv)
+            return !m_state.strict_mode;
+        if (m_state.current_token.value() == "yield"sv)
+            return !m_state.strict_mode && !m_state.in_generator_function_context;
+        return true;
+    }
+
     return m_state.current_token.type() == TokenType::Identifier
         || (m_state.current_token.type() == TokenType::Let && !m_state.strict_mode)
         || (m_state.current_token.type() == TokenType::Yield && !m_state.in_generator_function_context && !m_state.strict_mode); // See note in Parser::parse_identifier().
@@ -2859,6 +2876,9 @@ Token Parser::consume_identifier()
     if (match(TokenType::Identifier))
         return consume(TokenType::Identifier);
 
+    if (match(TokenType::EscapedKeyword))
+        return consume(TokenType::EscapedKeyword);
+
     // Note that 'let' is not a reserved keyword, but our lexer considers it such
     // As it's pretty nice to have that (for syntax highlighting and such), we'll
     // special-case it here instead.
@@ -2884,6 +2904,16 @@ Token Parser::consume_identifier_reference()
     if (match(TokenType::Identifier))
         return consume(TokenType::Identifier);
 
+    if (match(TokenType::EscapedKeyword)) {
+        auto name = m_state.current_token.value();
+        if (name == "await"sv)
+            syntax_error("Identifier reference may not be 'await'");
+        else if (m_state.strict_mode && (name == "let"sv || name == "yield"sv))
+            syntax_error(String::formatted("'{}' is not allowed as an identifier in strict mode", name));
+
+        return consume();
+    }
+
     // See note in Parser::parse_identifier().
     if (match(TokenType::Let)) {
         if (m_state.strict_mode)
diff --git a/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js b/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
index 4d089f21d9..a9a9a348a2 100644
--- a/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
+++ b/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
@@ -13,7 +13,74 @@ test("non-ascii escapes", () => {
     foo.𝓑𝓻𝓸𝔀𝓷 = 12389;
 
     expect(foo.𝓑𝓻𝓸𝔀𝓷).toBe(12389);
-    expect(foo.𝓑𝓻\ud835\udcf8𝔀𝓷).toBe(12389);
     expect(foo.𝓑𝓻\u{1d4f8}𝔀𝓷).toBe(12389);
     expect(foo.\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}).toBe(12389);
+
+    // A UTF-16 surrogate pair escape is allowed in a string but not in an identifier.
+    expect("foo.𝓑𝓻\ud835\udcf8𝔀𝓷").toEval();
+    expect("foo.𝓑𝓻\\ud835\\udcf8𝔀𝓷").not.toEval();
+});
+
+describe("escaped keywords", () => {
+    // We must double escape the backslashes here, otherwise the string literal itself
+    // already converts the escape sequences (and strings are more lenient about them).
+    test("keywords cannot be used in an escaped form", () => {
+        expect("\\u{69}\\u{66}(true) throw 'Should fail'").not.toEval();
+        expect("wh\\u{69}le(true) throw 'Should fail'").not.toEval();
+
+        expect("l\\u{65}t a = 3;").not.toEval();
+        expect("function *G(){ yiel\\0064 3; }").not.toEval();
+    });
+
+    test("escaped keywords cannot be used as standalone variables", () => {
+        expect("var fu\\u{6e}ction = 4").not.toEval();
+        expect("var \\u0077ith = 4").not.toEval();
+    });
+
+    test("'yield' and 'let' can be escaped as variables", () => {
+        var l\u{65}t = 3;
+        var yi\u0065ld = 5;
+        expect(let).toBe(3);
+        expect(yield).toBe(5);
+    });
+
+    test("'let' cannot be used in a lexical declaration but 'yield' can", () => {
+        expect("const l\\u{65}t = 3;").not.toEval();
+
+        const yi\u0065ld = 5;
+        expect(yield).toBe(5);
+    });
+
+    test("escaped 'yield' and 'let' variables are not allowed in strict mode", () => {
+        expect("function f() { 'use strict'; var l\\u{65}t = 3; }").not.toEval();
+        expect("function g() { 'use strict'; var yi\u0065ld = 5; }").not.toEval();
+    });
+
+    test("cannot use escaped 'yield' variable or label in generator context", () => {
+        expect("function *g() { var yi\u0065ld = 5; }").not.toEval();
+        expect("function *g() { yi\u0065ld: 5; }").not.toEval();
+    });
+
+    test("can use escaped 'let' variable and label in generator context", () => {
+        expect("function *i() { var \\u{6c}et = 6; }").toEval();
+        expect("function *j() { \\u{6c}et: 6; }").toEval();
+    });
+
+    test("can use keywords in some contexts", () => {
+        var obj = {
+            \u{69}\u{66}: 3,
+            wh\u{69}le() {
+                return 4;
+            },
+            ca\u0073e: "case",
+            get true() {
+                return false;
+            },
+        };
+
+        expect(obj.\u{69}f).toBe(3);
+        expect(obj.whi\u{6c}e()).toBe(4);
+        expect(obj.\u{63}ase).toBe("case");
+        expect(obj.\u0074r\u{0000075}e).toBeFalse();
+    });
 });
diff --git a/Userland/Libraries/LibJS/Token.cpp b/Userland/Libraries/LibJS/Token.cpp
index bbce9d6e4f..b9c9ebb082 100644
--- a/Userland/Libraries/LibJS/Token.cpp
+++ b/Userland/Libraries/LibJS/Token.cpp
@@ -204,6 +204,7 @@ bool Token::is_identifier_name() const
     // The standard defines this reversed: Identifiers are IdentifierNames except reserved words
     // https://tc39.es/ecma262/#prod-Identifier
     return m_type == TokenType::Identifier
+        || m_type == TokenType::EscapedKeyword
         || m_type == TokenType::Await
         || m_type == TokenType::BoolLiteral
         || m_type == TokenType::Break
diff --git a/Userland/Libraries/LibJS/Token.h b/Userland/Libraries/LibJS/Token.h
index 12c58ae10c..da5161771c 100644
--- a/Userland/Libraries/LibJS/Token.h
+++ b/Userland/Libraries/LibJS/Token.h
@@ -74,6 +74,7 @@ constexpr const u32 ZERO_WIDTH_JOINER { 0x200D };
     __ENUMERATE_JS_TOKEN(Equals, Operator)                      \
     __ENUMERATE_JS_TOKEN(EqualsEquals, Operator)                \
     __ENUMERATE_JS_TOKEN(EqualsEqualsEquals, Operator)          \
+    __ENUMERATE_JS_TOKEN(EscapedKeyword, Identifier)            \
     __ENUMERATE_JS_TOKEN(ExclamationMark, Operator)             \
     __ENUMERATE_JS_TOKEN(ExclamationMarkEquals, Operator)       \
     __ENUMERATE_JS_TOKEN(ExclamationMarkEqualsEquals, Operator) \
author	davidot <david.tuin@gmail.com>	2021-08-21 11:27:20 +0200
committer	Linus Groh <mail@linusgroh.de>	2021-08-24 07:42:37 +0100
commit	7bcffd1b6ad82bf8986a9e99ccbd3cd9900482a7 (patch)
tree	6b79d356efa5e870b5d72052cb431b8f3dce6b4b
parent	b012170d69c5d6a846034591b3a064d12346f8e1 (diff)
download	serenity-7bcffd1b6ad82bf8986a9e99ccbd3cd9900482a7.zip