summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordavidot <david.tuin@gmail.com>2021-08-21 11:27:20 +0200
committerLinus Groh <mail@linusgroh.de>2021-08-24 07:42:37 +0100
commit7bcffd1b6ad82bf8986a9e99ccbd3cd9900482a7 (patch)
tree6b79d356efa5e870b5d72052cb431b8f3dce6b4b
parentb012170d69c5d6a846034591b3a064d12346f8e1 (diff)
downloadserenity-7bcffd1b6ad82bf8986a9e99ccbd3cd9900482a7.zip
LibJS: Fix some small remaining issues with parsing unicode escapes
Added a test to ensure the behavior stays the same. We now throw on direct usage of an escaped keyword, with a specific error to make it clearer to the user.
-rw-r--r--Userland/Libraries/LibJS/Lexer.cpp27
-rw-r--r--Userland/Libraries/LibJS/Lexer.h2
-rw-r--r--Userland/Libraries/LibJS/Parser.cpp34
-rw-r--r--Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js69
-rw-r--r--Userland/Libraries/LibJS/Token.cpp1
-rw-r--r--Userland/Libraries/LibJS/Token.h1
6 files changed, 125 insertions, 9 deletions
diff --git a/Userland/Libraries/LibJS/Lexer.cpp b/Userland/Libraries/LibJS/Lexer.cpp
index 11071b98fe..450755f21f 100644
--- a/Userland/Libraries/LibJS/Lexer.cpp
+++ b/Userland/Libraries/LibJS/Lexer.cpp
@@ -372,11 +372,14 @@ bool Lexer::is_whitespace() const
return false;
}
-Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
+// UnicodeEscapeSequence :: https://tc39.es/ecma262/#prod-UnicodeEscapeSequence
+// u Hex4Digits
+// u{ CodePoint }
+Optional<u32> Lexer::is_identifier_unicode_escape(size_t& identifier_length) const
{
GenericLexer lexer(source().substring_view(m_position - 1));
- if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) {
+ if (auto code_point_or_error = lexer.consume_escaped_code_point(false); !code_point_or_error.is_error()) {
identifier_length = lexer.tell();
return code_point_or_error.value();
}
@@ -384,13 +387,18 @@ Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
return {};
}
+// IdentifierStart :: https://tc39.es/ecma262/#prod-IdentifierStart
+// UnicodeIDStart
+// $
+// _
+// \ UnicodeEscapeSequence
Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
{
u32 code_point = current_code_point();
identifier_length = 1;
if (code_point == '\\') {
- if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
+ if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
@@ -406,13 +414,19 @@ Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
return {};
}
+// IdentifierPart :: https://tc39.es/ecma262/#prod-IdentifierPart
+// UnicodeIDContinue
+// $
+// \ UnicodeEscapeSequence
+// <ZWNJ>
+// <ZWJ>
Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
{
u32 code_point = current_code_point();
identifier_length = 1;
if (code_point == '\\') {
- if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
+ if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
@@ -574,6 +588,7 @@ Token Lexer::next()
token_type = TokenType::TemplateLiteralString;
}
} else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
+ bool has_escaped_character = false;
// identifier or keyword
StringBuilder builder;
do {
@@ -581,6 +596,8 @@ Token Lexer::next()
for (size_t i = 0; i < identifier_length; ++i)
consume();
+ has_escaped_character |= identifier_length > 1;
+
code_point = is_identifier_middle(identifier_length);
} while (code_point.has_value());
@@ -592,7 +609,7 @@ Token Lexer::next()
if (it == s_keywords.end())
token_type = TokenType::Identifier;
else
- token_type = it->value;
+ token_type = has_escaped_character ? TokenType::EscapedKeyword : it->value;
} else if (is_numeric_literal_start()) {
token_type = TokenType::NumericLiteral;
bool is_invalid_numeric_literal = false;
diff --git a/Userland/Libraries/LibJS/Lexer.h b/Userland/Libraries/LibJS/Lexer.h
index ea4da7e14a..03991f596d 100644
--- a/Userland/Libraries/LibJS/Lexer.h
+++ b/Userland/Libraries/LibJS/Lexer.h
@@ -41,7 +41,7 @@ private:
bool is_eof() const;
bool is_line_terminator() const;
bool is_whitespace() const;
- Optional<u32> is_unicode_escape(size_t& identifier_length) const;
+ Optional<u32> is_identifier_unicode_escape(size_t& identifier_length) const;
Optional<u32> is_identifier_start(size_t& identifier_length) const;
Optional<u32> is_identifier_middle(size_t& identifier_length) const;
bool is_line_comment_start(bool line_has_token_yet) const;
diff --git a/Userland/Libraries/LibJS/Parser.cpp b/Userland/Libraries/LibJS/Parser.cpp
index ec04f51657..68b5e53e09 100644
--- a/Userland/Libraries/LibJS/Parser.cpp
+++ b/Userland/Libraries/LibJS/Parser.cpp
@@ -404,6 +404,11 @@ NonnullRefPtr<Statement> Parser::parse_statement(AllowLabelledFunction allow_lab
m_state.current_token = m_state.lexer.force_slash_as_regex();
[[fallthrough]];
default:
+ if (m_state.current_token.type() == TokenType::EscapedKeyword
+ && (m_state.strict_mode
+ || (m_state.current_token.value() != "yield"sv && m_state.current_token.value() != "let"sv)))
+ syntax_error("Keyword must not contain escaped characters");
+
if (match_identifier_name()) {
auto result = try_parse_labelled_statement(allow_labelled_function);
if (!result.is_null())
@@ -545,7 +550,7 @@ RefPtr<Statement> Parser::try_parse_labelled_statement(AllowLabelledFunction all
load_state();
};
- if (match(TokenType::Yield) && (m_state.strict_mode || m_state.in_generator_function_context)) {
+ if (m_state.current_token.value() == "yield"sv && (m_state.strict_mode || m_state.in_generator_function_context)) {
syntax_error("'yield' label not allowed in this context");
return {};
}
@@ -604,7 +609,8 @@ RefPtr<MetaProperty> Parser::try_parse_new_target_expression()
consume();
if (!match(TokenType::Identifier))
return {};
- if (consume().value() != "target")
+ // The string 'target' cannot have escapes so we check original value.
+ if (consume().original_value() != "target"sv)
return {};
state_rollback_guard.disarm();
@@ -847,6 +853,9 @@ Parser::PrimaryExpressionParseResult Parser::parse_primary_expression()
if (!m_state.allow_super_property_lookup)
syntax_error("'super' keyword unexpected here");
return { create_ast_node<SuperExpression>({ m_state.current_token.filename(), rule_start.position(), position() }) };
+ case TokenType::EscapedKeyword:
+ syntax_error("Keyword must not contain escaped characters");
+ [[fallthrough]];
case TokenType::Identifier: {
read_as_identifier:;
if (!try_parse_arrow_function_expression_failed_at_position(position())) {
@@ -2800,6 +2809,14 @@ bool Parser::match_variable_declaration()
bool Parser::match_identifier() const
{
+ if (m_state.current_token.type() == TokenType::EscapedKeyword) {
+ if (m_state.current_token.value() == "let"sv)
+ return !m_state.strict_mode;
+ if (m_state.current_token.value() == "yield"sv)
+ return !m_state.strict_mode && !m_state.in_generator_function_context;
+ return true;
+ }
+
return m_state.current_token.type() == TokenType::Identifier
|| (m_state.current_token.type() == TokenType::Let && !m_state.strict_mode)
|| (m_state.current_token.type() == TokenType::Yield && !m_state.in_generator_function_context && !m_state.strict_mode); // See note in Parser::parse_identifier().
@@ -2859,6 +2876,9 @@ Token Parser::consume_identifier()
if (match(TokenType::Identifier))
return consume(TokenType::Identifier);
+ if (match(TokenType::EscapedKeyword))
+ return consume(TokenType::EscapedKeyword);
+
// Note that 'let' is not a reserved keyword, but our lexer considers it such
// As it's pretty nice to have that (for syntax highlighting and such), we'll
// special-case it here instead.
@@ -2884,6 +2904,16 @@ Token Parser::consume_identifier_reference()
if (match(TokenType::Identifier))
return consume(TokenType::Identifier);
+ if (match(TokenType::EscapedKeyword)) {
+ auto name = m_state.current_token.value();
+ if (name == "await"sv)
+ syntax_error("Identifier reference may not be 'await'");
+ else if (m_state.strict_mode && (name == "let"sv || name == "yield"sv))
+ syntax_error(String::formatted("'{}' is not allowed as an identifier in strict mode", name));
+
+ return consume();
+ }
+
// See note in Parser::parse_identifier().
if (match(TokenType::Let)) {
if (m_state.strict_mode)
diff --git a/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js b/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
index 4d089f21d9..a9a9a348a2 100644
--- a/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
+++ b/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
@@ -13,7 +13,74 @@ test("non-ascii escapes", () => {
foo.𝓑𝓻𝓸𝔀𝓷 = 12389;
expect(foo.𝓑𝓻𝓸𝔀𝓷).toBe(12389);
- expect(foo.𝓑𝓻\ud835\udcf8𝔀𝓷).toBe(12389);
expect(foo.𝓑𝓻\u{1d4f8}𝔀𝓷).toBe(12389);
expect(foo.\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}).toBe(12389);
+
+ // A UTF-16 surrogate pair is allowed in a string but not in an identifier.
+ expect("foo.𝓑𝓻\ud835\udcf8𝔀𝓷").toEval();
+ expect("foo.𝓑𝓻\\ud835\\udcf8𝔀𝓷").not.toEval();
+});
+
+describe("escaped keywords", () => {
+ // We must double-escape the backslashes here; otherwise the string literal itself
+ // already converts the escape sequences (and strings are more lenient about escapes).
+ test("keywords cannot be used in an escaped form", () => {
+ expect("\\u{69}\\u{66}(true) throw 'Should fail'").not.toEval();
+ expect("wh\\u{69}le(true) throw 'Should fail'").not.toEval();
+
+ expect("l\\u{65}t a = 3;").not.toEval();
+ expect("function *G(){ yiel\\0064 3; }").not.toEval();
+ });
+
+ test("escaped keywords cannot be used as standalone variables", () => {
+ expect("var fu\\u{6e}ction = 4").not.toEval();
+ expect("var \\u0077ith = 4").not.toEval();
+ });
+
+ test("'yield' and 'let' can be escaped as variables", () => {
+ var l\u{65}t = 3;
+ var yi\u0065ld = 5;
+ expect(let).toBe(3);
+ expect(yield).toBe(5);
+ });
+
+ test("'let' cannot be used in a lexical declaration but 'yield' can", () => {
+ expect("const l\\u{65}t = 3;").not.toEval();
+
+ const yi\u0065ld = 5;
+ expect(yield).toBe(5);
+ });
+
+ test("escaped 'yield' and 'let' variables are not allowed in strict mode", () => {
+ expect("function f() { 'use strict'; var l\\u{65}t = 3; }").not.toEval();
+ expect("function g() { 'use strict'; var yi\u0065ld = 5; }").not.toEval();
+ });
+
+ test("cannot use escaped 'yield' variable or label in generator context", () => {
+ expect("function *g() { var yi\u0065ld = 5; }").not.toEval();
+ expect("function *g() { yi\u0065ld: 5; }").not.toEval();
+ });
+
+ test("can use escaped 'let' variable and label in generator context", () => {
+ expect("function *i() { var \\u{6c}et = 6; }").toEval();
+ expect("function *j() { \\u{6c}et: 6; }").toEval();
+ });
+
+ test("can use keywords in some contexts", () => {
+ var obj = {
+ \u{69}\u{66}: 3,
+ wh\u{69}le() {
+ return 4;
+ },
+ ca\u0073e: "case",
+ get true() {
+ return false;
+ },
+ };
+
+ expect(obj.\u{69}f).toBe(3);
+ expect(obj.whi\u{6c}e()).toBe(4);
+ expect(obj.\u{63}ase).toBe("case");
+ expect(obj.\u0074r\u{0000075}e).toBeFalse();
+ });
});
diff --git a/Userland/Libraries/LibJS/Token.cpp b/Userland/Libraries/LibJS/Token.cpp
index bbce9d6e4f..b9c9ebb082 100644
--- a/Userland/Libraries/LibJS/Token.cpp
+++ b/Userland/Libraries/LibJS/Token.cpp
@@ -204,6 +204,7 @@ bool Token::is_identifier_name() const
// The standard defines this reversed: Identifiers are IdentifierNames except reserved words
// https://tc39.es/ecma262/#prod-Identifier
return m_type == TokenType::Identifier
+ || m_type == TokenType::EscapedKeyword
|| m_type == TokenType::Await
|| m_type == TokenType::BoolLiteral
|| m_type == TokenType::Break
diff --git a/Userland/Libraries/LibJS/Token.h b/Userland/Libraries/LibJS/Token.h
index 12c58ae10c..da5161771c 100644
--- a/Userland/Libraries/LibJS/Token.h
+++ b/Userland/Libraries/LibJS/Token.h
@@ -74,6 +74,7 @@ constexpr const u32 ZERO_WIDTH_JOINER { 0x200D };
__ENUMERATE_JS_TOKEN(Equals, Operator) \
__ENUMERATE_JS_TOKEN(EqualsEquals, Operator) \
__ENUMERATE_JS_TOKEN(EqualsEqualsEquals, Operator) \
+ __ENUMERATE_JS_TOKEN(EscapedKeyword, Identifier) \
__ENUMERATE_JS_TOKEN(ExclamationMark, Operator) \
__ENUMERATE_JS_TOKEN(ExclamationMarkEquals, Operator) \
__ENUMERATE_JS_TOKEN(ExclamationMarkEqualsEquals, Operator) \