diff options
author | davidot <david.tuin@gmail.com> | 2021-08-14 17:07:47 +0200 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2021-08-16 23:20:04 +0100 |
commit | 47bc72bcf6f2fae3dc0938b96d8c7890b1005ead (patch) | |
tree | 61e9fe09d1cb3a012616e7744e15c6d7491c4ee6 | |
parent | 4d6502de423bdf7a278d67fa2ebb515fd834ce41 (diff) | |
download | serenity-47bc72bcf6f2fae3dc0938b96d8c7890b1005ead.zip |
LibJS: Correctly handle Unicode characters in JS source text
Also recognize additional white space characters.
-rw-r--r-- | Userland/Libraries/LibJS/Lexer.cpp | 84 | ||||
-rw-r--r-- | Userland/Libraries/LibJS/Lexer.h | 5 | ||||
-rw-r--r-- | Userland/Libraries/LibJS/Parser.h | 4 | ||||
-rw-r--r-- | Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp | 4 | ||||
-rw-r--r-- | Userland/Libraries/LibJS/Token.cpp | 4 | ||||
-rw-r--r-- | Userland/Libraries/LibJS/Token.h | 15 |
6 files changed, 100 insertions, 16 deletions
diff --git a/Userland/Libraries/LibJS/Lexer.cpp b/Userland/Libraries/LibJS/Lexer.cpp index 9ae60a7e77..e50e0a3956 100644 --- a/Userland/Libraries/LibJS/Lexer.cpp +++ b/Userland/Libraries/LibJS/Lexer.cpp @@ -9,6 +9,8 @@ #include <AK/CharacterTypes.h> #include <AK/Debug.h> #include <AK/HashMap.h> +#include <AK/Utf8View.h> +#include <LibUnicode/CharacterTypes.h> #include <stdio.h> namespace JS { @@ -186,6 +188,26 @@ void Lexer::consume() } else { dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again."); } + } else if (is_unicode_character()) { + size_t char_size = 1; + if ((m_current_char & 64) == 0) { + // invalid char + } else if ((m_current_char & 32) == 0) { + char_size = 2; + } else if ((m_current_char & 16) == 0) { + char_size = 3; + } else if ((m_current_char & 8) == 0) { + char_size = 4; + } + + VERIFY(char_size > 1); + --char_size; + + m_position += char_size; + if (did_reach_eof()) + return; + + m_line_column++; } else { m_line_column++; } @@ -310,21 +332,67 @@ bool Lexer::is_line_terminator() const { if (m_current_char == '\n' || m_current_char == '\r') return true; - if (m_position > 0 && m_position + 1 < m_source.length()) { - auto three_chars_view = m_source.substring_view(m_position - 1, 3); - return (three_chars_view == LINE_SEPARATOR) || (three_chars_view == PARAGRAPH_SEPARATOR); - } + if (!is_unicode_character()) + return false; + + auto code_point = current_code_point(); + return code_point == LINE_SEPARATOR || code_point == PARAGRAPH_SEPARATOR; +} + +bool Lexer::is_unicode_character() const +{ + return (m_current_char & 128) != 0; +} + +u32 Lexer::current_code_point() const +{ + static constexpr const u32 REPLACEMENT_CHARACTER = 0xFFFD; + if (m_position == 0) + return REPLACEMENT_CHARACTER; + Utf8View utf_8_view { m_source.substring_view(m_position - 1) }; + return *utf_8_view.begin(); +} + +bool Lexer::is_whitespace() const +{ + if (is_ascii_space(m_current_char)) + return true; + if (!is_unicode_character()) + return false; + auto code_point = current_code_point(); + if (code_point == NO_BREAK_SPACE) + return true; + + static auto space_separator_category = Unicode::general_category_from_string("Space_Separator"sv); + if (space_separator_category.has_value()) + return Unicode::code_point_has_general_category(code_point, *space_separator_category); return false; } bool Lexer::is_identifier_start() const { - return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$'; + if (!is_unicode_character()) + return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$'; + auto code_point = current_code_point(); + + static auto id_start_category = Unicode::property_from_string("ID_Start"sv); + if (id_start_category.has_value()) + return Unicode::code_point_has_property(code_point, *id_start_category); + return false; } bool Lexer::is_identifier_middle() const { - return is_identifier_start() || is_ascii_digit(m_current_char); + if (!is_unicode_character()) + return is_identifier_start() || is_ascii_digit(m_current_char); + auto code_point = current_code_point(); + if (code_point == ZERO_WIDTH_NON_JOINER || code_point == ZERO_WIDTH_JOINER) + return true; + + static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv); + if (id_continue_category.has_value()) + return Unicode::code_point_has_property(code_point, *id_continue_category); + return false; } bool Lexer::is_line_comment_start(bool line_has_token_yet) const @@ -390,10 +458,10 @@ Token Lexer::next() do { consume(); } while (is_line_terminator()); - } else if (is_ascii_space(m_current_char)) { + } else if (is_whitespace()) { do { consume(); - } while (is_ascii_space(m_current_char)); + } while (is_whitespace()); } else if (is_line_comment_start(line_has_token_yet)) { consume(); do { diff --git a/Userland/Libraries/LibJS/Lexer.h b/Userland/Libraries/LibJS/Lexer.h index 3117ee15a5..f4b9af4b20 100644 --- a/Userland/Libraries/LibJS/Lexer.h +++ b/Userland/Libraries/LibJS/Lexer.h @@ -34,8 +34,13 @@ private: bool consume_hexadecimal_number(); bool consume_binary_number(); bool consume_decimal_number(); + + bool is_unicode_character() const; + u32 current_code_point() const; + bool is_eof() const; bool is_line_terminator() const; + bool is_whitespace() const; bool is_identifier_start() const; bool is_identifier_middle() const; bool is_line_comment_start(bool line_has_token_yet) const; diff --git a/Userland/Libraries/LibJS/Parser.h b/Userland/Libraries/LibJS/Parser.h index eaf9df0a94..6597072736 100644 --- a/Userland/Libraries/LibJS/Parser.h +++ b/Userland/Libraries/LibJS/Parser.h @@ -120,8 +120,8 @@ public: String source_string { source }; source_string.replace("\r\n", "\n"); source_string.replace("\r", "\n"); - source_string.replace(LINE_SEPARATOR, "\n"); - source_string.replace(PARAGRAPH_SEPARATOR, "\n"); + source_string.replace(LINE_SEPARATOR_STRING, "\n"); + source_string.replace(PARAGRAPH_SEPARATOR_STRING, "\n"); StringBuilder builder; builder.append(source_string.split_view('\n', true)[position.value().line - 1]); builder.append('\n'); diff --git a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp index ca3f4593b5..1b5edc8a31 100644 --- a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp +++ b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp @@ -85,8 +85,8 @@ static String escape_regexp_pattern(const RegExpObject& regexp_object) // FIXME: Check u flag and escape accordingly pattern.replace("\n", "\\n", true); pattern.replace("\r", "\\r", true); - pattern.replace(LINE_SEPARATOR, "\\u2028", true); - pattern.replace(PARAGRAPH_SEPARATOR, "\\u2029", true); + pattern.replace(LINE_SEPARATOR_STRING, "\\u2028", true); + pattern.replace(PARAGRAPH_SEPARATOR_STRING, "\\u2029", true); pattern.replace("/", "\\/", true); return pattern; } diff --git a/Userland/Libraries/LibJS/Token.cpp b/Userland/Libraries/LibJS/Token.cpp index ffa611e80b..0944dbd04f 100644 --- a/Userland/Libraries/LibJS/Token.cpp +++ b/Userland/Libraries/LibJS/Token.cpp @@ -130,7 +130,7 @@ String Token::string_value(StringValueStatus& status) const continue; } // Line continuation - if (lexer.next_is(LINE_SEPARATOR) || lexer.next_is(PARAGRAPH_SEPARATOR)) { + if (lexer.next_is(LINE_SEPARATOR_STRING) || lexer.next_is(PARAGRAPH_SEPARATOR_STRING)) { lexer.ignore(3); continue; } @@ -281,7 +281,7 @@ bool Token::is_identifier_name() const bool Token::trivia_contains_line_terminator() const { - return m_trivia.contains('\n') || m_trivia.contains('\r') || m_trivia.contains(LINE_SEPARATOR) || m_trivia.contains(PARAGRAPH_SEPARATOR); + return m_trivia.contains('\n') || m_trivia.contains('\r') || m_trivia.contains(LINE_SEPARATOR_STRING) || m_trivia.contains(PARAGRAPH_SEPARATOR_STRING); } } diff --git a/Userland/Libraries/LibJS/Token.h b/Userland/Libraries/LibJS/Token.h index 1a9033c8f1..f5755589bf 100644 --- a/Userland/Libraries/LibJS/Token.h +++ b/Userland/Libraries/LibJS/Token.h @@ -13,11 +13,22 @@ namespace JS { // U+2028 LINE SEPARATOR constexpr const char line_separator_chars[] { (char)0xe2, (char)0x80, (char)0xa8, 0 }; -constexpr const StringView LINE_SEPARATOR { line_separator_chars }; +constexpr const StringView LINE_SEPARATOR_STRING { line_separator_chars }; +constexpr const u32 LINE_SEPARATOR { 0x2028 }; // U+2029 PARAGRAPH SEPARATOR constexpr const char paragraph_separator_chars[] { (char)0xe2, (char)0x80, (char)0xa9, 0 }; -constexpr const StringView PARAGRAPH_SEPARATOR { paragraph_separator_chars }; +constexpr const StringView PARAGRAPH_SEPARATOR_STRING { paragraph_separator_chars }; +constexpr const u32 PARAGRAPH_SEPARATOR { 0x2029 }; + +// U+00A0 NO BREAK SPACE +constexpr const u32 NO_BREAK_SPACE { 0x00A0 }; + +// U+200C ZERO WIDTH NON-JOINER +constexpr const u32 ZERO_WIDTH_NON_JOINER { 0x200C }; + +// U+200D ZERO WIDTH JOINER +constexpr const u32 ZERO_WIDTH_JOINER { 0x200D }; #define ENUMERATE_JS_TOKENS \ __ENUMERATE_JS_TOKEN(Ampersand, Operator) \ |