LibJS: Support LegacyOctalEscapeSequence in string literals

https://tc39.es/ecma262/#sec-additional-syntax-string-literals

The syntax and semantics of 11.8.4 is extended as follows except that this extension is not allowed for strict mode code:

Syntax

    EscapeSequence ::
        CharacterEscapeSequence
        LegacyOctalEscapeSequence
        NonOctalDecimalEscapeSequence
        HexEscapeSequence
        UnicodeEscapeSequence

    LegacyOctalEscapeSequence ::
        OctalDigit [lookahead ∉ OctalDigit]
        ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
        FourToSeven OctalDigit
        ZeroToThree OctalDigit OctalDigit

    ZeroToThree :: one of
        0 1 2 3

    FourToSeven :: one of
        4 5 6 7

    NonOctalDecimalEscapeSequence :: one of
        8 9

This definition of EscapeSequence is not used in strict mode or when parsing TemplateCharacter.

Note: It is possible for string literals to precede a Use Strict Directive that places the enclosing code in strict mode, and implementations must take care to not use this extended definition of EscapeSequence with such literals. For example, attempting to parse the following source text must fail:

    function invalid() { "\7"; "use strict"; }
author: Linus Groh <mail@linusgroh.de> 2020-10-24 13:30:57 +0100
committer: Andreas Kling <kling@serenityos.org> 2020-10-24 16:34:01 +0200
commit: 4fb96afafc6e87b75d2e310f949b0ca7b337b050 (patch)
tree: e654afea4bc7dd536d3d0b46182a3ed477e57fc2
parent: 9f036959e867e52bbec12a384e00f21ee0a07be2 (diff)
download: serenity-4fb96afafc6e87b75d2e310f949b0ca7b337b050.zip
5 files changed, 104 insertions, 11 deletions
diff --git a/Libraries/LibJS/Parser.cpp b/Libraries/LibJS/Parser.cpp
index 806d4aa73b..d8ab3f7a40 100644
--- a/Libraries/LibJS/Parser.cpp
+++ b/Libraries/LibJS/Parser.cpp
@@ -836,23 +836,41 @@ NonnullRefPtr<ArrayExpression> Parser::parse_array_expression()
     return create_ast_node<ArrayExpression>(move(elements));
 }
 
-NonnullRefPtr<StringLiteral> Parser::parse_string_literal(Token token)
+NonnullRefPtr<StringLiteral> Parser::parse_string_literal(Token token, bool in_template_literal)
 {
     auto status = Token::StringValueStatus::Ok;
     auto string = token.string_value(status);
     if (status != Token::StringValueStatus::Ok) {
         String message;
-        if (status == Token::StringValueStatus::MalformedHexEscape || status == Token::StringValueStatus::MalformedUnicodeEscape) {
+        if (status == Token::StringValueStatus::LegacyOctalEscapeSequence) {
+            m_parser_state.m_string_legacy_octal_escape_sequence_in_scope = true;
+            if (in_template_literal)
+                message = "Octal escape sequence not allowed in template literal";
+            else if (m_parser_state.m_strict_mode)
+                message = "Octal escape sequence in string literal not allowed in strict mode";
+        } else if (status == Token::StringValueStatus::MalformedHexEscape || status == Token::StringValueStatus::MalformedUnicodeEscape) {
             auto type = status == Token::StringValueStatus::MalformedUnicodeEscape ? "unicode" : "hexadecimal";
             message = String::formatted("Malformed {} escape sequence", type);
         } else if (status == Token::StringValueStatus::UnicodeEscapeOverflow) {
             message = "Unicode code_point must not be greater than 0x10ffff in escape sequence";
+        } else {
+            ASSERT_NOT_REACHED();
         }
 
         if (!message.is_empty())
             syntax_error(message, token.line_number(), token.line_column());
     }
 
+    // It is possible for string literals to precede a Use Strict Directive that places the
+    // enclosing code in strict mode, and implementations must take care to not use this
+    // extended definition of EscapeSequence with such literals. For example, attempting to
+    // parse the following source text must fail:
+    //
+    // function invalid() { "\7"; "use strict"; }
+
+    if (m_parser_state.m_string_legacy_octal_escape_sequence_in_scope && string == "use strict")
+        syntax_error("Octal escape sequence in string literal not allowed in strict mode");
+
     if (m_parser_state.m_use_strict_directive == UseStrictDirectiveState::Looking) {
         if (string == "use strict" && token.type() != TokenType::TemplateLiteralString) {
             m_parser_state.m_use_strict_directive = UseStrictDirectiveState::Found;
@@ -884,7 +902,7 @@ NonnullRefPtr<TemplateLiteral> Parser::parse_template_literal(bool is_tagged)
     while (!done() && !match(TokenType::TemplateLiteralEnd) && !match(TokenType::UnterminatedTemplateLiteral)) {
         if (match(TokenType::TemplateLiteralString)) {
             auto token = consume();
-            expressions.append(parse_string_literal(token));
+            expressions.append(parse_string_literal(token, true));
             if (is_tagged)
                 raw_strings.append(create_ast_node<StringLiteral>(token.value()));
         } else if (match(TokenType::TemplateLiteralExprStart)) {
@@ -1249,6 +1267,7 @@ NonnullRefPtr<BlockStatement> Parser::parse_block_statement(bool& is_strict)
         first = false;
     }
     m_parser_state.m_strict_mode = initial_strict_mode_state;
+    m_parser_state.m_string_legacy_octal_escape_sequence_in_scope = false;
     consume(TokenType::CurlyClose);
     block->add_variables(m_parser_state.m_let_scopes.last());
     block->add_functions(m_parser_state.m_function_scopes.last());
diff --git a/Libraries/LibJS/Parser.h b/Libraries/LibJS/Parser.h
index 007e8b6879..565f3ff704 100644
--- a/Libraries/LibJS/Parser.h
+++ b/Libraries/LibJS/Parser.h
@@ -87,7 +87,7 @@ public:
     NonnullRefPtr<RegExpLiteral> parse_regexp_literal();
     NonnullRefPtr<ObjectExpression> parse_object_expression();
     NonnullRefPtr<ArrayExpression> parse_array_expression();
-    NonnullRefPtr<StringLiteral> parse_string_literal(Token token);
+    NonnullRefPtr<StringLiteral> parse_string_literal(Token token, bool in_template_literal = false);
     NonnullRefPtr<TemplateLiteral> parse_template_literal(bool is_tagged);
     NonnullRefPtr<Expression> parse_secondary_expression(NonnullRefPtr<Expression>, int min_precedence, Associativity associate = Associativity::Right);
     NonnullRefPtr<CallExpression> parse_call_expression(NonnullRefPtr<Expression>);
@@ -184,6 +184,7 @@ private:
         bool m_in_function_context { false };
         bool m_in_break_context { false };
         bool m_in_continue_context { false };
+        bool m_string_legacy_octal_escape_sequence_in_scope { false };
 
         explicit ParserState(Lexer);
     };
diff --git a/Libraries/LibJS/Tests/string-escapes.js b/Libraries/LibJS/Tests/string-escapes.js
index e4abf9ec07..d8c75e9055 100644
--- a/Libraries/LibJS/Tests/string-escapes.js
+++ b/Libraries/LibJS/Tests/string-escapes.js
@@ -13,3 +13,32 @@ test("unicode escapes", () => {
     expect(`\u{1f41e}`).toBe("🐞");
     expect("\u00ff").toBe(String.fromCharCode(0xff));
 });
+
+describe("octal escapes", () => {
+    test("basic functionality", () => {
+        expect("\1").toBe("\u0001");
+        expect("\2").toBe("\u0002");
+        expect("\3").toBe("\u0003");
+        expect("\4").toBe("\u0004");
+        expect("\5").toBe("\u0005");
+        expect("\6").toBe("\u0006");
+        expect("\7").toBe("\u0007");
+        expect("\8").toBe("8");
+        expect("\9").toBe("9");
+        expect("\128").toBe("\n8");
+        expect("\141bc").toBe("abc");
+        expect("f\157o\142a\162").toBe("foobar");
+        expect("\123\145\162\145\156\151\164\171\117\123").toBe("SerenityOS");
+    });
+
+    test("syntax error in template literal", () => {
+        expect("`\\123`").not.toEval();
+    });
+
+    test("syntax error in strict mode", () => {
+        expect("'use strict'; '\\123'").not.toEval();
+        expect('"use strict"; "\\123"').not.toEval();
+        // Special case, string literal precedes use strict directive
+        expect("'\\123'; somethingElse; 'use strict'").not.toEval();
+    });
+});
diff --git a/Libraries/LibJS/Token.cpp b/Libraries/LibJS/Token.cpp
index 0921ee9e14..57190ef487 100644
--- a/Libraries/LibJS/Token.cpp
+++ b/Libraries/LibJS/Token.cpp
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@gmx.de>
+ * Copyright (c) 2020, Linus Groh <mail@linusgroh.de>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -103,8 +104,19 @@ String Token::string_value(StringValueStatus& status) const
 {
     ASSERT(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString);
     auto is_template = type() == TokenType::TemplateLiteralString;
+    auto offset = is_template ? 0 : 1;
 
-    auto offset = type() == TokenType::TemplateLiteralString ? 0 : 1;
+    size_t i;
+
+    auto lookahead = [&]<typename T>(T fn, size_t distance = 1) -> bool {
+        if (i + distance >= m_value.length() - offset)
+            return false;
+        return fn(m_value[i + distance]);
+    };
+
+    auto is_octal_digit = [](char c) {
+        return c >= '0' && c <= '7';
+    };
 
     auto encoding_failure = [&status](StringValueStatus parse_status) -> String {
         status = parse_status;
@@ -112,7 +124,7 @@ String Token::string_value(StringValueStatus& status) const
     };
 
     StringBuilder builder;
-    for (size_t i = offset; i < m_value.length() - offset; ++i) {
+    for (i = offset; i < m_value.length() - offset; ++i) {
         if (m_value[i] == '\\' && i + 1 < m_value.length() - offset) {
             i++;
             switch (m_value[i]) {
@@ -134,9 +146,6 @@ String Token::string_value(StringValueStatus& status) const
             case 'v':
                 builder.append('\v');
                 break;
-            case '0':
-                builder.append((char)0);
-                break;
             case '\'':
                 builder.append('\'');
                 break;
@@ -200,9 +209,43 @@ String Token::string_value(StringValueStatus& status) const
                     builder.append(m_value[i]);
                     break;
                 }
+                if (m_value[i] == '0' && !lookahead(isdigit)) {
+                    builder.append((char)0);
+                    break;
+                }
 
-                // FIXME: Also parse octal. Should anything else generate a syntax error?
-                builder.append(m_value[i]);
+                // In non-strict mode LegacyOctalEscapeSequence is allowed in strings:
+                // https://tc39.es/ecma262/#sec-additional-syntax-string-literals
+                String octal_str;
+
+                // OctalDigit [lookahead ∉ OctalDigit]
+                if (is_octal_digit(m_value[i]) && !lookahead(is_octal_digit)) {
+                    status = StringValueStatus::LegacyOctalEscapeSequence;
+                    octal_str = String(&m_value[i], 1);
+                }
+                // ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
+                else if (m_value[i] >= '0' && m_value[i] <= '3' && lookahead(is_octal_digit) && !lookahead(is_octal_digit, 2)) {
+                    status = StringValueStatus::LegacyOctalEscapeSequence;
+                    octal_str = String(m_value.substring_view(i, 2));
+                    i++;
+                }
+                // FourToSeven OctalDigit
+                else if (m_value[i] >= '4' && m_value[i] <= '7' && lookahead(is_octal_digit)) {
+                    status = StringValueStatus::LegacyOctalEscapeSequence;
+                    octal_str = String(m_value.substring_view(i, 2));
+                    i++;
+                }
+                // ZeroToThree OctalDigit OctalDigit
+                else if (m_value[i] >= '0' && m_value[i] <= '3' && lookahead(is_octal_digit) && lookahead(is_octal_digit, 2)) {
+                    status = StringValueStatus::LegacyOctalEscapeSequence;
+                    octal_str = String(m_value.substring_view(i, 3));
+                    i += 2;
+                }
+
+                if (status == StringValueStatus::LegacyOctalEscapeSequence)
+                    builder.append_code_point(strtoul(octal_str.characters(), nullptr, 8));
+                else
+                    builder.append(m_value[i]);
             }
         } else {
             builder.append(m_value[i]);
diff --git a/Libraries/LibJS/Token.h b/Libraries/LibJS/Token.h
index 3b967f9775..b9d4bb0d60 100644
--- a/Libraries/LibJS/Token.h
+++ b/Libraries/LibJS/Token.h
@@ -208,6 +208,7 @@ public:
         MalformedHexEscape,
         MalformedUnicodeEscape,
         UnicodeEscapeOverflow,
+        LegacyOctalEscapeSequence,
     };
     String string_value(StringValueStatus& status) const;
author	Linus Groh <mail@linusgroh.de>	2020-10-24 13:30:57 +0100
committer	Andreas Kling <kling@serenityos.org>	2020-10-24 16:34:01 +0200
commit	4fb96afafc6e87b75d2e310f949b0ca7b337b050 (patch)
tree	e654afea4bc7dd536d3d0b46182a3ed477e57fc2
parent	9f036959e867e52bbec12a384e00f21ee0a07be2 (diff)
download	serenity-4fb96afafc6e87b75d2e310f949b0ca7b337b050.zip