diff options
author | Matthew Olsson <matthewcolsson@gmail.com> | 2020-05-16 23:27:25 -0700 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2020-05-18 17:58:17 +0200 |
commit | e415dd4e9c11fe065f62909ef25266609a2678dc (patch) | |
tree | 2bdfa3ab5152f4aea0261240c08acd4457664d14 | |
parent | b3090678a9d9d1a8bda18a1781557d0547882da9 (diff) | |
download | serenity-e415dd4e9c11fe065f62909ef25266609a2678dc.zip |
LibJS: Handle hex and unicode escape sequences in string literals
Introduces the following syntax:
'\x55'
'\u26a0'
'\u{1f41e}'
-rw-r--r-- | Libraries/LibJS/Parser.cpp | 28 | ||||
-rw-r--r-- | Libraries/LibJS/Parser.h | 1 | ||||
-rw-r--r-- | Libraries/LibJS/Tests/string-escapes.js | 17 | ||||
-rw-r--r-- | Libraries/LibJS/Token.cpp | 72 | ||||
-rw-r--r-- | Libraries/LibJS/Token.h | 9 |
5 files changed, 118 insertions, 9 deletions
```diff
diff --git a/Libraries/LibJS/Parser.cpp b/Libraries/LibJS/Parser.cpp
index bafb72454a..3b1343079f 100644
--- a/Libraries/LibJS/Parser.cpp
+++ b/Libraries/LibJS/Parser.cpp
@@ -405,7 +405,7 @@ NonnullRefPtr<Expression> Parser::parse_primary_expression()
     case TokenType::BoolLiteral:
         return create_ast_node<BooleanLiteral>(consume().bool_value());
     case TokenType::StringLiteral:
-        return create_ast_node<StringLiteral>(consume().string_value());
+        return parse_string_literal(consume());
     case TokenType::NullLiteral:
         consume();
         return create_ast_node<NullLiteral>();
@@ -494,7 +494,7 @@ NonnullRefPtr<ObjectExpression> Parser::parse_object_expression()
             property_value = create_ast_node<Identifier>(identifier);
             need_colon = false;
         } else if (match(TokenType::StringLiteral)) {
-            property_key = create_ast_node<StringLiteral>(consume(TokenType::StringLiteral).string_value());
+            property_key = parse_string_literal(consume());
         } else if (match(TokenType::NumericLiteral)) {
             property_key = create_ast_node<StringLiteral>(consume(TokenType::NumericLiteral).value());
         } else if (match(TokenType::BracketOpen)) {
@@ -559,6 +559,28 @@ NonnullRefPtr<ArrayExpression> Parser::parse_array_expression()
     return create_ast_node<ArrayExpression>(move(elements));
 }
 
+NonnullRefPtr<StringLiteral> Parser::parse_string_literal(Token token)
+{
+    auto status = Token::StringValueStatus::Ok;
+    auto string = token.string_value(status);
+    if (status != Token::StringValueStatus::Ok) {
+        String message;
+        if (status == Token::StringValueStatus::MalformedHexEscape || status == Token::StringValueStatus::MalformedUnicodeEscape) {
+            auto type = status == Token::StringValueStatus::MalformedUnicodeEscape ? "unicode" : "hexadecimal";
+            message = String::format("Malformed %s escape sequence", type);
+        } else if (status == Token::StringValueStatus::UnicodeEscapeOverflow) {
+            message = "Unicode codepoint must not be greater than 0x10ffff in escape sequence";
+        }
+
+        syntax_error(
+            message,
+            m_parser_state.m_current_token.line_number(),
+            m_parser_state.m_current_token.line_column()
+        );
+    }
+    return create_ast_node<StringLiteral>(string);
+}
+
 NonnullRefPtr<TemplateLiteral> Parser::parse_template_literal(bool is_tagged)
 {
     consume(TokenType::TemplateLiteralStart);
@@ -579,7 +601,7 @@ NonnullRefPtr<TemplateLiteral> Parser::parse_template_literal(bool is_tagged)
     while (!match(TokenType::TemplateLiteralEnd) && !match(TokenType::UnterminatedTemplateLiteral)) {
         if (match(TokenType::TemplateLiteralString)) {
             auto token = consume();
-            expressions.append(create_ast_node<StringLiteral>(token.string_value()));
+            expressions.append(parse_string_literal(token));
             if (is_tagged)
                 raw_strings.append(create_ast_node<StringLiteral>(token.value()));
         } else if (match(TokenType::TemplateLiteralExprStart)) {
diff --git a/Libraries/LibJS/Parser.h b/Libraries/LibJS/Parser.h
index 91f90953ea..1f5d2f4fec 100644
--- a/Libraries/LibJS/Parser.h
+++ b/Libraries/LibJS/Parser.h
@@ -70,6 +70,7 @@ public:
     NonnullRefPtr<Expression> parse_unary_prefixed_expression();
     NonnullRefPtr<ObjectExpression> parse_object_expression();
     NonnullRefPtr<ArrayExpression> parse_array_expression();
+    NonnullRefPtr<StringLiteral> parse_string_literal(Token token);
     NonnullRefPtr<TemplateLiteral> parse_template_literal(bool is_tagged);
     NonnullRefPtr<Expression> parse_secondary_expression(NonnullRefPtr<Expression>, int min_precedence, Associativity associate = Associativity::Right);
     NonnullRefPtr<CallExpression> parse_call_expression(NonnullRefPtr<Expression>);
diff --git a/Libraries/LibJS/Tests/string-escapes.js b/Libraries/LibJS/Tests/string-escapes.js
new file mode 100644
index 0000000000..ad9f47ea12
--- /dev/null
+++ b/Libraries/LibJS/Tests/string-escapes.js
@@ -0,0 +1,17 @@
+load("test-common.js")
+
+try {
+    assert("\x55" === "U");
+    assert("\X55" === "X55");
+    assert(`\x55` === "U");
+    assert(`\X55` === "X55");
+
+    assert("\u26a0" === "⚠");
+    assert(`\u26a0` === "⚠");
+    assert("\u{1f41e}" === "🐞");
+    assert(`\u{1f41e}` === "🐞");
+
+    console.log("PASS");
+} catch (e) {
+    console.log("FAIL: " + e);
+}
diff --git a/Libraries/LibJS/Token.cpp b/Libraries/LibJS/Token.cpp
index ffb95908a5..cc3a1ee0eb 100644
--- a/Libraries/LibJS/Token.cpp
+++ b/Libraries/LibJS/Token.cpp
@@ -27,6 +27,7 @@
 #include "Token.h"
 #include <AK/Assertions.h>
 #include <AK/StringBuilder.h>
+#include <AK/Utf32View.h>
 #include <ctype.h>
 
 namespace JS {
@@ -72,13 +73,26 @@ double Token::double_value() const
     return strtod(value_string.characters(), nullptr);
 }
 
-String Token::string_value() const
+static u32 hex2int(char x)
+{
+    ASSERT(isxdigit(x));
+    if (x >= '0' && x <= '9')
+        return x - '0';
+    return 10u + (tolower(x) - 'a');
+}
+
+String Token::string_value(StringValueStatus& status) const
 {
     ASSERT(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString);
     auto is_template = type() == TokenType::TemplateLiteralString;
 
     auto offset = type() == TokenType::TemplateLiteralString ? 0 : 1;
 
+    auto encoding_failure = [&status](StringValueStatus parse_status) -> String {
+        status = parse_status;
+        return {};
+    };
+
     StringBuilder builder;
     for (size_t i = offset; i < m_value.length() - offset; ++i) {
         if (m_value[i] == '\\' && i + 1 < m_value.length() - offset) {
@@ -114,14 +128,62 @@ String Token::string_value() const
             case '\\':
                 builder.append('\\');
                 break;
+            case 'x': {
+                if (i + 2 >= m_value.length() - offset)
+                    return encoding_failure(StringValueStatus::MalformedHexEscape);
+
+                auto digit1 = m_value[++i];
+                auto digit2 = m_value[++i];
+                if (!isxdigit(digit1) || !isxdigit(digit2))
+                    return encoding_failure(StringValueStatus::MalformedHexEscape);
+                builder.append(static_cast<char>(hex2int(digit1) * 16 + hex2int(digit2)));
+                break;
+            }
+            case 'u': {
+                if (i + 1 >= m_value.length() - offset)
+                    return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
+                u32 code_point = m_value[++i];
+
+                if (code_point == '{') {
+                    code_point = 0;
+                    do {
+                        if (i + 1 >= m_value.length() - offset)
+                            return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
+
+                        auto ch = m_value[++i];
+                        if (!isxdigit(ch))
+                            return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
+
+                        auto new_code_point = (code_point << 4u) | hex2int(ch);
+                        if (new_code_point < code_point)
+                            return encoding_failure(StringValueStatus::UnicodeEscapeOverflow);
+                        code_point = new_code_point;
+                    } while (m_value[i + 1] != '}');
+                    ++i;
+                } else {
+                    if (i + 3 >= m_value.length() - offset || !isxdigit(code_point))
+                        return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
+
+                    code_point = hex2int(code_point);
+                    for (int j = 0; j < 3; ++j) {
+                        auto ch = m_value[++i];
+                        if (!isxdigit(ch))
+                            return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
+                        code_point = (code_point << 4u) | hex2int(ch);
+                    }
+                }
+
+                builder.append({ &code_point, 1 });
+                break;
+            }
             default:
                 if (is_template && (m_value[i] == '$' || m_value[i] == '`')) {
                     builder.append(m_value[i]);
-                } else {
-                    // FIXME: Also parse octal, hex and unicode sequences
-                    // should anything else generate a syntax error?
-                    builder.append(m_value[i]);
+                    break;
                 }
+
+                // FIXME: Also parse octal. Should anything else generate a syntax error?
+                builder.append(m_value[i]);
             }
         } else {
             builder.append(m_value[i]);
diff --git a/Libraries/LibJS/Token.h b/Libraries/LibJS/Token.h
index c66d3c5431..884a249f27 100644
--- a/Libraries/LibJS/Token.h
+++ b/Libraries/LibJS/Token.h
@@ -172,9 +172,16 @@ public:
     size_t line_number() const { return m_line_number; }
     size_t line_column() const { return m_line_column; }
     double double_value() const;
-    String string_value() const;
     bool bool_value() const;
 
+    enum class StringValueStatus {
+        Ok,
+        MalformedHexEscape,
+        MalformedUnicodeEscape,
+        UnicodeEscapeOverflow,
+    };
+    String string_value(StringValueStatus& status) const;
+
     bool is_identifier_name() const;
 
 private:
```