LibJS: Implement UTF-16 surrogate pair concatenation without iteration

Performance of string concatenation regressed in a57e2f9. That commit iterates over the entire LHS string to find its last code unit and check whether it is a high surrogate. Instead, first look at the 3rd-to-last byte of the UTF-8 encoded LHS string to check whether it is the leading byte of a 3-byte code point; then decode just those 3 bytes to check whether we have a high surrogate. Similarly, inspect only the first 3 bytes of the RHS string to check whether we have a low surrogate.
author: Timothy Flynn <trflynn89@pm.me> 2022-01-17 22:02:50 -0500
committer: Linus Groh <mail@linusgroh.de> 2022-01-18 09:46:55 +0000
commit: 444b2d9ec2e962235e2ea7c15c8457c8627d7ca2 (patch)
tree: a795128102e45324dda93ac11f5beb2247af51f3 /Userland/Libraries
parent: 567b3a481076083a6a5ad0e443bbaca71e396e44 (diff)
download: serenity-444b2d9ec2e962235e2ea7c15c8457c8627d7ca2.zip
1 files changed, 27 insertions, 20 deletions
diff --git a/Userland/Libraries/LibJS/Runtime/Value.cpp b/Userland/Libraries/LibJS/Runtime/Value.cpp
index 61bccf04cc..4612614e84 100644
--- a/Userland/Libraries/LibJS/Runtime/Value.cpp
+++ b/Userland/Libraries/LibJS/Runtime/Value.cpp
@@ -995,31 +995,38 @@ static PrimitiveString* concatenate_strings(GlobalObject& global_object, Primiti
         return js_string(vm, Utf16String(move(combined)));
     }
 
-    Utf8View lhs_string { lhs.string() };
-    Utf8View rhs_string { rhs.string() };
-
+    auto const& lhs_string = lhs.string();
+    auto const& rhs_string = rhs.string();
     StringBuilder builder(lhs_string.length() + rhs_string.length());
-    Optional<u16> high_surrogate;
 
-    for (auto it = lhs_string.begin(); it != lhs_string.end(); ++it) {
-        if (!it.peek(1).has_value() && Utf16View::is_high_surrogate(*it) && !rhs_string.is_empty())
-            high_surrogate = *it;
-        else
-            builder.append_code_point(*it);
-    }
+    auto return_combined_strings = [&]() {
+        builder.append(lhs_string);
+        builder.append(rhs_string);
+        return js_string(vm, builder.to_string());
+    };
 
-    if (high_surrogate.has_value()) {
-        auto low_surrogate = *rhs_string.begin();
+    // Surrogates encoded as UTF-8 are 3 bytes.
+    if ((lhs_string.length() < 3) || (rhs_string.length() < 3))
+        return return_combined_strings();
 
-        if (Utf16View::is_low_surrogate(low_surrogate)) {
-            builder.append_code_point(Utf16View::decode_surrogate_pair(*high_surrogate, low_surrogate));
-            rhs_string = rhs_string.substring_view(3); // A low surrogate encoded as UTF-8 is 3 bytes.
-        } else {
-            builder.append_code_point(*high_surrogate);
-        }
-    }
+    auto lhs_leading_byte = static_cast<u8>(lhs_string[lhs_string.length() - 3]);
+    auto rhs_leading_byte = static_cast<u8>(rhs_string[0]);
+
+    if ((lhs_leading_byte & 0xf0) != 0xe0)
+        return return_combined_strings();
+    if ((rhs_leading_byte & 0xf0) != 0xe0)
+        return return_combined_strings();
+
+    auto high_surrogate = *Utf8View(lhs_string.substring_view(lhs_string.length() - 3)).begin();
+    auto low_surrogate = *Utf8View(rhs_string).begin();
+
+    if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate))
+        return return_combined_strings();
+
+    builder.append(lhs_string.substring_view(0, lhs_string.length() - 3));
+    builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
+    builder.append(rhs_string.substring_view(3));
 
-    builder.append(rhs_string.as_string());
     return js_string(vm, builder.to_string());
 }
author	Timothy Flynn <trflynn89@pm.me>	2022-01-17 22:02:50 -0500
committer	Linus Groh <mail@linusgroh.de>	2022-01-18 09:46:55 +0000
commit	444b2d9ec2e962235e2ea7c15c8457c8627d7ca2 (patch)
tree	a795128102e45324dda93ac11f5beb2247af51f3 /Userland/Libraries
parent	567b3a481076083a6a5ad0e443bbaca71e396e44 (diff)
download	serenity-444b2d9ec2e962235e2ea7c15c8457c8627d7ca2.zip