LibJS: Correctly handle Unicode characters in JS source text

Also recognize additional white space characters.
author: davidot <david.tuin@gmail.com> 2021-08-14 17:07:47 +0200
committer: Linus Groh <mail@linusgroh.de> 2021-08-16 23:20:04 +0100
commit: 47bc72bcf6f2fae3dc0938b96d8c7890b1005ead (patch)
tree: 61e9fe09d1cb3a012616e7744e15c6d7491c4ee6
parent: 4d6502de423bdf7a278d67fa2ebb515fd834ce41 (diff)
download: serenity-47bc72bcf6f2fae3dc0938b96d8c7890b1005ead.zip
6 files changed, 100 insertions, 16 deletions
diff --git a/Userland/Libraries/LibJS/Lexer.cpp b/Userland/Libraries/LibJS/Lexer.cpp
index 9ae60a7e77..e50e0a3956 100644
--- a/Userland/Libraries/LibJS/Lexer.cpp
+++ b/Userland/Libraries/LibJS/Lexer.cpp
@@ -9,6 +9,8 @@
 #include <AK/CharacterTypes.h>
 #include <AK/Debug.h>
 #include <AK/HashMap.h>
+#include <AK/Utf8View.h>
+#include <LibUnicode/CharacterTypes.h>
 #include <stdio.h>
 
 namespace JS {
@@ -186,6 +188,26 @@ void Lexer::consume()
         } else {
             dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again.");
         }
+    } else if (is_unicode_character()) {
+        size_t char_size = 1; // Byte length of the sequence introduced by m_current_char, derived from its high bits.
+        if ((m_current_char & 64) == 0) {
+            // Stray continuation byte (10xxxxxx): malformed, so it is consumed as a single byte.
+        } else if ((m_current_char & 32) == 0) {
+            char_size = 2;
+        } else if ((m_current_char & 16) == 0) {
+            char_size = 3;
+        } else if ((m_current_char & 8) == 0) {
+            char_size = 4;
+        }
+
+        VERIFY(char_size >= 1); // Malformed lead bytes (10xxxxxx, 11111xxx) keep char_size == 1; source text is untrusted, so that must not abort.
+        --char_size; // Only the trailing bytes are skipped here; presumably the lead byte is advanced past later in consume().
+
+        m_position += char_size;
+        if (did_reach_eof())
+            return;
+
+        m_line_column++;
     } else {
         m_line_column++;
     }
@@ -310,21 +332,67 @@ bool Lexer::is_line_terminator() const
 {
     if (m_current_char == '\n' || m_current_char == '\r')
         return true;
-    if (m_position > 0 && m_position + 1 < m_source.length()) {
-        auto three_chars_view = m_source.substring_view(m_position - 1, 3);
-        return (three_chars_view == LINE_SEPARATOR) || (three_chars_view == PARAGRAPH_SEPARATOR);
-    }
+    if (!is_unicode_character())
+        return false;
+
+    auto code_point = current_code_point();
+    return code_point == LINE_SEPARATOR || code_point == PARAGRAPH_SEPARATOR;
+}
+
+bool Lexer::is_unicode_character() const
+{
+    return (m_current_char & 0x80) == 0x80; // High bit set: the current byte is outside the ASCII range.
+}
+
+u32 Lexer::current_code_point() const
+{
+    if (m_position == 0)
+        return 0xFFFD; // U+FFFD REPLACEMENT CHARACTER: there is no byte before position 0 to decode.
+    // Decoding starts at m_position - 1, presumably the byte currently held in m_current_char.
+    Utf8View const code_points { m_source.substring_view(m_position - 1) };
+    return *code_points.begin();
+}
+
+bool Lexer::is_whitespace() const
+{
+    if (is_ascii_space(m_current_char)) // ASCII fast path; no decoding needed.
+        return true;
+    if (!is_unicode_character())
+        return false;
+    auto code_point = current_code_point();
+    if (code_point == NO_BREAK_SPACE || code_point == 0xFEFF) // U+FEFF ZERO WIDTH NO-BREAK SPACE is ECMAScript WhiteSpace but has category Cf, so the Zs lookup below misses it.
+        return true;
+
+    static auto space_separator_category = Unicode::general_category_from_string("Space_Separator"sv); // Looked up once, on first use.
+    if (space_separator_category.has_value())
+        return Unicode::code_point_has_general_category(code_point, *space_separator_category);
     return false;
 }
 
 bool Lexer::is_identifier_start() const
 {
-    return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
+    if (!is_unicode_character()) {
+        auto const ch = m_current_char; // ASCII path: letters, '$' and '_' may start an identifier.
+        return ch == '$' || ch == '_' || is_ascii_alpha(ch);
+    }
+
+    static auto id_start_property = Unicode::property_from_string("ID_Start"sv); // Looked up once, on first use.
+    auto const is_known = id_start_property.has_value(); // Without the property data no non-ASCII start is accepted.
+    return is_known && Unicode::code_point_has_property(current_code_point(), *id_start_property);
 }
 
 bool Lexer::is_identifier_middle() const
 {
-    return is_identifier_start() || is_ascii_digit(m_current_char);
+    if (!is_unicode_character()) {
+        return is_ascii_digit(m_current_char) || is_identifier_start(); // ASCII path: digits plus everything that may start an identifier.
+    }
+
+    auto const current = current_code_point();
+    if (current == ZERO_WIDTH_JOINER || current == ZERO_WIDTH_NON_JOINER) // ZWJ and ZWNJ are accepted regardless of ID_Continue.
+        return true;
+
+    static auto id_continue_property = Unicode::property_from_string("ID_Continue"sv); // Looked up once, on first use.
+    return id_continue_property.has_value() && Unicode::code_point_has_property(current, *id_continue_property);
 }
 
 bool Lexer::is_line_comment_start(bool line_has_token_yet) const
@@ -390,10 +458,10 @@ Token Lexer::next()
                 do {
                     consume();
                 } while (is_line_terminator());
-            } else if (is_ascii_space(m_current_char)) {
+            } else if (is_whitespace()) {
                 do {
                     consume();
-                } while (is_ascii_space(m_current_char));
+                } while (is_whitespace());
             } else if (is_line_comment_start(line_has_token_yet)) {
                 consume();
                 do {
diff --git a/Userland/Libraries/LibJS/Lexer.h b/Userland/Libraries/LibJS/Lexer.h
index 3117ee15a5..f4b9af4b20 100644
--- a/Userland/Libraries/LibJS/Lexer.h
+++ b/Userland/Libraries/LibJS/Lexer.h
@@ -34,8 +34,13 @@ private:
     bool consume_hexadecimal_number();
     bool consume_binary_number();
     bool consume_decimal_number();
+
+    bool is_unicode_character() const; // True when the current byte has its high bit set (i.e. is not ASCII).
+    u32 current_code_point() const; // Code point decoded from the source starting at m_position - 1; U+FFFD when m_position is 0.
+
     bool is_eof() const;
     bool is_line_terminator() const;
+    bool is_whitespace() const; // True for ASCII white space and for the non-ASCII white space code points checked in Lexer.cpp.
     bool is_identifier_start() const;
     bool is_identifier_middle() const;
     bool is_line_comment_start(bool line_has_token_yet) const;
diff --git a/Userland/Libraries/LibJS/Parser.h b/Userland/Libraries/LibJS/Parser.h
index eaf9df0a94..6597072736 100644
--- a/Userland/Libraries/LibJS/Parser.h
+++ b/Userland/Libraries/LibJS/Parser.h
@@ -120,8 +120,8 @@ public:
             String source_string { source };
             source_string.replace("\r\n", "\n");
             source_string.replace("\r", "\n");
-            source_string.replace(LINE_SEPARATOR, "\n");
-            source_string.replace(PARAGRAPH_SEPARATOR, "\n");
+            source_string.replace(LINE_SEPARATOR_STRING, "\n");
+            source_string.replace(PARAGRAPH_SEPARATOR_STRING, "\n");
             StringBuilder builder;
             builder.append(source_string.split_view('\n', true)[position.value().line - 1]);
             builder.append('\n');
diff --git a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp
index ca3f4593b5..1b5edc8a31 100644
--- a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp
+++ b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp
@@ -85,8 +85,8 @@ static String escape_regexp_pattern(const RegExpObject& regexp_object)
     // FIXME: Check u flag and escape accordingly
     pattern.replace("\n", "\\n", true);
     pattern.replace("\r", "\\r", true);
-    pattern.replace(LINE_SEPARATOR, "\\u2028", true);
-    pattern.replace(PARAGRAPH_SEPARATOR, "\\u2029", true);
+    pattern.replace(LINE_SEPARATOR_STRING, "\\u2028", true);
+    pattern.replace(PARAGRAPH_SEPARATOR_STRING, "\\u2029", true);
     pattern.replace("/", "\\/", true);
     return pattern;
 }
diff --git a/Userland/Libraries/LibJS/Token.cpp b/Userland/Libraries/LibJS/Token.cpp
index ffa611e80b..0944dbd04f 100644
--- a/Userland/Libraries/LibJS/Token.cpp
+++ b/Userland/Libraries/LibJS/Token.cpp
@@ -130,7 +130,7 @@ String Token::string_value(StringValueStatus& status) const
             continue;
         }
         // Line continuation
-        if (lexer.next_is(LINE_SEPARATOR) || lexer.next_is(PARAGRAPH_SEPARATOR)) {
+        if (lexer.next_is(LINE_SEPARATOR_STRING) || lexer.next_is(PARAGRAPH_SEPARATOR_STRING)) {
             lexer.ignore(3);
             continue;
         }
@@ -281,7 +281,7 @@ bool Token::is_identifier_name() const
 
 bool Token::trivia_contains_line_terminator() const
 {
-    return m_trivia.contains('\n') || m_trivia.contains('\r') || m_trivia.contains(LINE_SEPARATOR) || m_trivia.contains(PARAGRAPH_SEPARATOR);
+    return m_trivia.contains('\n') || m_trivia.contains('\r') || m_trivia.contains(LINE_SEPARATOR_STRING) || m_trivia.contains(PARAGRAPH_SEPARATOR_STRING); // The *_STRING constants are the UTF-8 byte sequences; the unsuffixed names are now u32 code points.
 }
 
 }
diff --git a/Userland/Libraries/LibJS/Token.h b/Userland/Libraries/LibJS/Token.h
index 1a9033c8f1..f5755589bf 100644
--- a/Userland/Libraries/LibJS/Token.h
+++ b/Userland/Libraries/LibJS/Token.h
@@ -13,11 +13,22 @@ namespace JS {
 
 // U+2028 LINE SEPARATOR
 constexpr const char line_separator_chars[] { (char)0xe2, (char)0x80, (char)0xa8, 0 };
-constexpr const StringView LINE_SEPARATOR { line_separator_chars };
+constexpr const StringView LINE_SEPARATOR_STRING { line_separator_chars };
+constexpr const u32 LINE_SEPARATOR { 0x2028 };
 
 // U+2029 PARAGRAPH SEPARATOR
 constexpr const char paragraph_separator_chars[] { (char)0xe2, (char)0x80, (char)0xa9, 0 };
-constexpr const StringView PARAGRAPH_SEPARATOR { paragraph_separator_chars };
+constexpr const StringView PARAGRAPH_SEPARATOR_STRING { paragraph_separator_chars };
+constexpr const u32 PARAGRAPH_SEPARATOR { 0x2029 };
+
+// U+00A0 NO BREAK SPACE (checked explicitly by Lexer::is_whitespace())
+constexpr const u32 NO_BREAK_SPACE { 0x00A0 };
+
+// U+200C ZERO WIDTH NON-JOINER (accepted by Lexer::is_identifier_middle())
+constexpr const u32 ZERO_WIDTH_NON_JOINER { 0x200C };
+
+// U+200D ZERO WIDTH JOINER (accepted by Lexer::is_identifier_middle())
+constexpr const u32 ZERO_WIDTH_JOINER { 0x200D };
 
 #define ENUMERATE_JS_TOKENS                                     \
     __ENUMERATE_JS_TOKEN(Ampersand, Operator)                   \
author	davidot <david.tuin@gmail.com>	2021-08-14 17:07:47 +0200
committer	Linus Groh <mail@linusgroh.de>	2021-08-16 23:20:04 +0100
commit	47bc72bcf6f2fae3dc0938b96d8c7890b1005ead (patch)
tree	61e9fe09d1cb3a012616e7744e15c6d7491c4ee6
parent	4d6502de423bdf7a278d67fa2ebb515fd834ce41 (diff)
download	serenity-47bc72bcf6f2fae3dc0938b96d8c7890b1005ead.zip