diff options
author | Timothy Flynn <trflynn89@pm.me> | 2022-01-17 22:02:50 -0500 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2022-01-18 09:46:55 +0000 |
commit | 444b2d9ec2e962235e2ea7c15c8457c8627d7ca2 (patch) | |
tree | a795128102e45324dda93ac11f5beb2247af51f3 /Userland/Libraries | |
parent | 567b3a481076083a6a5ad0e443bbaca71e396e44 (diff) | |
download | serenity-444b2d9ec2e962235e2ea7c15c8457c8627d7ca2.zip |
LibJS: Implement UTF-16 surrogate pair concatenation without iteration
Performance of string concatenation regressed in a57e2f9. That commit
iterates over the LHS string to find the last code unit, to check if it
is a high surrogate. Instead, first look at the 3rd-to-last byte in the
UTF-8 encoded string to check if it is the leading byte of a 3-byte code
point; then decode just those 3 bytes to check if we have a high
surrogate. Similarly, check the first 3 bytes of the RHS string to check
if we have a low surrogate.
Diffstat (limited to 'Userland/Libraries')
-rw-r--r-- | Userland/Libraries/LibJS/Runtime/Value.cpp | 47 |
1 file changed, 27 insertions, 20 deletions
diff --git a/Userland/Libraries/LibJS/Runtime/Value.cpp b/Userland/Libraries/LibJS/Runtime/Value.cpp index 61bccf04cc..4612614e84 100644 --- a/Userland/Libraries/LibJS/Runtime/Value.cpp +++ b/Userland/Libraries/LibJS/Runtime/Value.cpp @@ -995,31 +995,38 @@ static PrimitiveString* concatenate_strings(GlobalObject& global_object, Primiti return js_string(vm, Utf16String(move(combined))); } - Utf8View lhs_string { lhs.string() }; - Utf8View rhs_string { rhs.string() }; - + auto const& lhs_string = lhs.string(); + auto const& rhs_string = rhs.string(); StringBuilder builder(lhs_string.length() + rhs_string.length()); - Optional<u16> high_surrogate; - for (auto it = lhs_string.begin(); it != lhs_string.end(); ++it) { - if (!it.peek(1).has_value() && Utf16View::is_high_surrogate(*it) && !rhs_string.is_empty()) - high_surrogate = *it; - else - builder.append_code_point(*it); - } + auto return_combined_strings = [&]() { + builder.append(lhs_string); + builder.append(rhs_string); + return js_string(vm, builder.to_string()); + }; - if (high_surrogate.has_value()) { - auto low_surrogate = *rhs_string.begin(); + // Surrogates encoded as UTF-8 are 3 bytes. + if ((lhs_string.length() < 3) || (rhs_string.length() < 3)) + return return_combined_strings(); - if (Utf16View::is_low_surrogate(low_surrogate)) { - builder.append_code_point(Utf16View::decode_surrogate_pair(*high_surrogate, low_surrogate)); - rhs_string = rhs_string.substring_view(3); // A low surrogate encoded as UTF-8 is 3 bytes. 
- } else { - builder.append_code_point(*high_surrogate); - } - } + auto lhs_leading_byte = static_cast<u8>(lhs_string[lhs_string.length() - 3]); + auto rhs_leading_byte = static_cast<u8>(rhs_string[0]); + + if ((lhs_leading_byte & 0xf0) != 0xe0) + return return_combined_strings(); + if ((rhs_leading_byte & 0xf0) != 0xe0) + return return_combined_strings(); + + auto high_surrogate = *Utf8View(lhs_string.substring_view(lhs_string.length() - 3)).begin(); + auto low_surrogate = *Utf8View(rhs_string).begin(); + + if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) + return return_combined_strings(); + + builder.append(lhs_string.substring_view(0, lhs_string.length() - 3)); + builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate)); + builder.append(rhs_string.substring_view(3)); - builder.append(rhs_string.as_string()); return js_string(vm, builder.to_string()); } |