LibJS: Use GenericLexer to consume escaped code points

author: Timothy Flynn <trflynn89@pm.me> 2021-08-17 22:24:17 -0400
committer: Andreas Kling <kling@serenityos.org> 2021-08-19 23:49:25 +0200
commit: dd44a5e9488eb0758d8bf29f112d01b6837f7215 (patch)
tree: a29792be80f9342d447f8734717ac22a8fb91674 /Userland
parent: fd8ccedf2b21a49571100b5a56d9a1f5d28b834c (diff)
download: serenity-dd44a5e9488eb0758d8bf29f112d01b6837f7215.zip
1 file changed, 17 insertions, 52 deletions
diff --git a/Userland/Libraries/LibJS/Token.cpp b/Userland/Libraries/LibJS/Token.cpp
index 0944dbd04f..5e918b6557 100644
--- a/Userland/Libraries/LibJS/Token.cpp
+++ b/Userland/Libraries/LibJS/Token.cpp
@@ -10,7 +10,6 @@
 #include <AK/CharacterTypes.h>
 #include <AK/GenericLexer.h>
 #include <AK/StringBuilder.h>
-#include <AK/Utf16View.h>
 
 namespace JS {
 
@@ -103,16 +102,6 @@ String Token::string_value(StringValueStatus& status) const
         return {};
     };
 
-    auto decode_surrogate = [&lexer]() -> Optional<u16> {
-        u16 surrogate = 0;
-        for (int j = 0; j < 4; ++j) {
-            if (!lexer.next_is(is_ascii_hex_digit))
-                return {};
-            surrogate = (surrogate << 4u) | hex2int(lexer.consume());
-        }
-        return surrogate;
-    };
-
     StringBuilder builder;
     while (!lexer.is_eof()) {
         // No escape, consume one char and continue
@@ -121,6 +110,23 @@ String Token::string_value(StringValueStatus& status) const
             continue;
         }
 
+        // Unicode escape
+        if (lexer.next_is("\\u"sv)) {
+            auto code_point_or_error = lexer.consume_escaped_code_point();
+
+            if (code_point_or_error.is_error()) {
+                switch (code_point_or_error.error()) {
+                case GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape:
+                    return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
+                case GenericLexer::UnicodeEscapeError::UnicodeEscapeOverflow:
+                    return encoding_failure(StringValueStatus::UnicodeEscapeOverflow);
+                }
+            }
+
+            builder.append_code_point(code_point_or_error.value());
+            continue;
+        }
+
         lexer.ignore();
         VERIFY(!lexer.is_eof());
 
@@ -150,47 +156,6 @@ String Token::string_value(StringValueStatus& status) const
             builder.append_code_point(code_point);
             continue;
         }
-        // Unicode escape
-        if (lexer.next_is('u')) {
-            lexer.ignore();
-            u32 code_point = 0;
-            if (lexer.next_is('{')) {
-                lexer.ignore();
-                while (true) {
-                    if (!lexer.next_is(is_ascii_hex_digit))
-                        return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
-                    auto new_code_point = (code_point << 4u) | hex2int(lexer.consume());
-                    if (new_code_point < code_point)
-                        return encoding_failure(StringValueStatus::UnicodeEscapeOverflow);
-                    code_point = new_code_point;
-                    if (lexer.next_is('}'))
-                        break;
-                }
-                lexer.ignore();
-            } else {
-                auto high_surrogate = decode_surrogate();
-                if (!high_surrogate.has_value())
-                    return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
-
-                if (Utf16View::is_high_surrogate(*high_surrogate) && lexer.consume_specific("\\u"sv)) {
-                    auto low_surrogate = decode_surrogate();
-                    if (!low_surrogate.has_value())
-                        return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
-
-                    if (Utf16View::is_low_surrogate(*low_surrogate)) {
-                        code_point = Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate);
-                    } else {
-                        builder.append_code_point(*high_surrogate);
-                        code_point = *low_surrogate;
-                    }
-
-                } else {
-                    code_point = *high_surrogate;
-                }
-            }
-            builder.append_code_point(code_point);
-            continue;
-        }
 
         // In non-strict mode LegacyOctalEscapeSequence is allowed in strings:
         // https://tc39.es/ecma262/#sec-additional-syntax-string-literals
author	Timothy Flynn <trflynn89@pm.me>	2021-08-17 22:24:17 -0400
committer	Andreas Kling <kling@serenityos.org>	2021-08-19 23:49:25 +0200
commit	dd44a5e9488eb0758d8bf29f112d01b6837f7215 (patch)
tree	a29792be80f9342d447f8734717ac22a8fb91674 /Userland
parent	fd8ccedf2b21a49571100b5a56d9a1f5d28b834c (diff)
download	serenity-dd44a5e9488eb0758d8bf29f112d01b6837f7215.zip