diff options
author | Andreas Kling <kling@serenityos.org> | 2022-08-05 23:58:47 +0200 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2022-08-06 00:29:15 +0200 |
commit | 64b29eb4596976138b2e4f67f8502419e584f1e4 (patch) | |
tree | e16a0965e0613b3247e64cb363e654847ef6a20e /Userland/Libraries/LibJS/Runtime | |
parent | cf62d08b2ad8180249f68cd71a16c437706d000c (diff) | |
download | serenity-64b29eb4596976138b2e4f67f8502419e584f1e4.zip |
LibJS: Implement string concatenation using ropes
Instead of concatenating string data every time you add two strings
together in JavaScript, we now create a new PrimitiveString that points
to the two concatenated strings instead.
This turns concatenated strings into a tree structure that doesn't have
to be serialized until someone wants the characters in the string.
This *dramatically* reduces the peak memory footprint when running
the SunSpider benchmark (from ~6G to ~1G on my machine). It's also
significantly faster (1.39x) :^)
Diffstat (limited to 'Userland/Libraries/LibJS/Runtime')
-rw-r--r-- | Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp | 162 | ||||
-rw-r--r-- | Userland/Libraries/LibJS/Runtime/PrimitiveString.h | 15 | ||||
-rw-r--r-- | Userland/Libraries/LibJS/Runtime/Value.cpp | 54 |
3 files changed, 172 insertions, 59 deletions
diff --git a/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp b/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp index 5aa4f0e17f..5b20e6f341 100644 --- a/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp +++ b/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp @@ -15,15 +15,22 @@ namespace JS { +PrimitiveString::PrimitiveString(PrimitiveString& lhs, PrimitiveString& rhs) +{ + m_is_rope = true; + m_left = &lhs; + m_right = &rhs; +} + PrimitiveString::PrimitiveString(String string) - : m_utf8_string(move(string)) - , m_has_utf8_string(true) + : m_has_utf8_string(true) + , m_utf8_string(move(string)) { } PrimitiveString::PrimitiveString(Utf16String string) - : m_utf16_string(move(string)) - , m_has_utf16_string(true) + : m_has_utf16_string(true) + , m_utf16_string(move(string)) { } @@ -32,8 +39,22 @@ PrimitiveString::~PrimitiveString() vm().string_cache().remove(m_utf8_string); } +void PrimitiveString::visit_edges(Cell::Visitor& visitor) +{ + Cell::visit_edges(visitor); + if (m_is_rope) { + visitor.visit(m_left); + visitor.visit(m_right); + } +} + bool PrimitiveString::is_empty() const { + if (m_is_rope) { + // NOTE: We never make an empty rope string. + return false; + } + if (m_has_utf16_string) return m_utf16_string.is_empty(); if (m_has_utf8_string) @@ -43,6 +64,7 @@ bool PrimitiveString::is_empty() const String const& PrimitiveString::string() const { + resolve_rope_if_needed(); if (!m_has_utf8_string) { m_utf8_string = m_utf16_string.to_utf8(); m_has_utf8_string = true; @@ -52,6 +74,7 @@ String const& PrimitiveString::string() const Utf16String const& PrimitiveString::utf16_string() const { + resolve_rope_if_needed(); if (!m_has_utf16_string) { m_utf16_string = Utf16String(m_utf8_string); m_has_utf16_string = true; @@ -139,4 +162,135 @@ PrimitiveString* js_string(VM& vm, String string) return js_string(vm.heap(), move(string)); } +PrimitiveString* js_rope_string(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs) +{ + // We're here to concatenate two strings into a new rope string. + // However, if any of them are empty, no rope is required. + + bool lhs_empty = lhs.is_empty(); + bool rhs_empty = rhs.is_empty(); + + if (lhs_empty && rhs_empty) + return &vm.empty_string(); + + if (lhs_empty) + return &rhs; + + if (rhs_empty) + return &lhs; + + return vm.heap().allocate_without_global_object<PrimitiveString>(lhs, rhs); +} + +void PrimitiveString::resolve_rope_if_needed() const +{ + if (!m_is_rope) + return; + + // NOTE: Special case for two concatenated UTF-16 strings. + // This is here as an optimization, although I'm unsure how valuable it is. + if (m_left->has_utf16_string() && m_right->has_utf16_string()) { + auto const& lhs_string = m_left->utf16_string(); + auto const& rhs_string = m_right->utf16_string(); + + Vector<u16, 1> combined; + combined.ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units()); + combined.extend(lhs_string.string()); + combined.extend(rhs_string.string()); + + m_utf16_string = Utf16String(move(combined)); + m_has_utf16_string = true; + m_is_rope = false; + m_left = nullptr; + m_right = nullptr; + return; + } + + // This vector will hold all the pieces of the rope that need to be assembled + // into the resolved string. + Vector<PrimitiveString const*> pieces; + + // NOTE: We traverse the rope tree without using recursion, since we'd run out of + // stack space quickly when handling a long sequence of unresolved concatenations. + Vector<PrimitiveString const*> stack; + stack.append(m_right); + stack.append(m_left); + while (!stack.is_empty()) { + auto* current = stack.take_last(); + if (current->m_is_rope) { + stack.append(current->m_right); + stack.append(current->m_left); + continue; + } + pieces.append(current); + } + + // Now that we have all the pieces, we can concatenate them using a StringBuilder. + StringBuilder builder; + + // We keep track of the previous piece in order to handle surrogate pairs spread across two pieces. + PrimitiveString const* previous = nullptr; + for (auto const* current : pieces) { + if (!previous) { + // This is the very first piece, just append it and continue. + builder.append(current->string()); + previous = current; + continue; + } + + // Get the UTF-8 representations for both strings. + auto const& previous_string_as_utf8 = previous->string(); + auto const& current_string_as_utf8 = current->string(); + + // NOTE: Now we need to look at the end of the previous string and the start + // of the current string, to see if they should be combined into a surrogate. + + // Surrogates encoded as UTF-8 are 3 bytes. + if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) { + builder.append(current->string()); + previous = current; + continue; + } + + // Might the previous string end with a UTF-8 encoded surrogate? + if ((static_cast<u8>(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) { + // If not, just append the current string and continue. + builder.append(current->string()); + previous = current; + continue; + } + + // Might the current string begin with a UTF-8 encoded surrogate? + if ((static_cast<u8>(current_string_as_utf8[0]) & 0xf0) != 0xe0) { + // If not, just append the current string and continue. + builder.append(current->string()); + previous = current; + continue; + } + + auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin(); + auto low_surrogate = *Utf8View(current_string_as_utf8).begin(); + + if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) { + builder.append(current->string()); + previous = current; + continue; + } + + // Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point. + builder.trim(3); + builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate)); + + // Append the remaining part of the current string. + builder.append(current_string_as_utf8.substring_view(3)); + previous = current; + } + + m_utf8_string = builder.to_string(); + m_has_utf8_string = true; + m_is_rope = false; + m_left = nullptr; + m_right = nullptr; +} + } diff --git a/Userland/Libraries/LibJS/Runtime/PrimitiveString.h b/Userland/Libraries/LibJS/Runtime/PrimitiveString.h index 1a4f4af0dc..41b79828c0 100644 --- a/Userland/Libraries/LibJS/Runtime/PrimitiveString.h +++ b/Userland/Libraries/LibJS/Runtime/PrimitiveString.h @@ -17,6 +17,7 @@ namespace JS { class PrimitiveString final : public Cell { public: + explicit PrimitiveString(PrimitiveString&, PrimitiveString&); explicit PrimitiveString(String); explicit PrimitiveString(Utf16String); virtual ~PrimitiveString(); @@ -37,12 +38,20 @@ public: private: virtual StringView class_name() const override { return "PrimitiveString"sv; } + virtual void visit_edges(Cell::Visitor&) override; - mutable String m_utf8_string; + void resolve_rope_if_needed() const; + + mutable bool m_is_rope { false }; mutable bool m_has_utf8_string { false }; + mutable bool m_has_utf16_string { false }; + + mutable PrimitiveString* m_left { nullptr }; + mutable PrimitiveString* m_right { nullptr }; + + mutable String m_utf8_string; mutable Utf16String m_utf16_string; - mutable bool m_has_utf16_string { false }; }; PrimitiveString* js_string(Heap&, Utf16View const&); @@ -54,4 +63,6 @@ PrimitiveString* js_string(VM&, Utf16String); PrimitiveString* js_string(Heap&, String); PrimitiveString* js_string(VM&, String); +PrimitiveString* js_rope_string(VM&, PrimitiveString&, PrimitiveString&); + } diff --git a/Userland/Libraries/LibJS/Runtime/Value.cpp b/Userland/Libraries/LibJS/Runtime/Value.cpp index 34fb6c1395..f210760c6a 100644 --- a/Userland/Libraries/LibJS/Runtime/Value.cpp +++ b/Userland/Libraries/LibJS/Runtime/Value.cpp @@ -1074,58 +1074,6 @@ ThrowCompletionOr<Value> unsigned_right_shift(GlobalObject& global_object, Value return vm.throw_completion<TypeError>(global_object, ErrorType::BigIntBadOperator, "unsigned right-shift"); } -// https://tc39.es/ecma262/#string-concatenation -static PrimitiveString* concatenate_strings(GlobalObject& global_object, PrimitiveString const& lhs, PrimitiveString const& rhs) -{ - auto& vm = global_object.vm(); - - if (lhs.has_utf16_string() && rhs.has_utf16_string()) { - auto const& lhs_string = lhs.utf16_string(); - auto const& rhs_string = rhs.utf16_string(); - - Vector<u16, 1> combined; - combined.ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units()); - combined.extend(lhs_string.string()); - combined.extend(rhs_string.string()); - - return js_string(vm, Utf16String(move(combined))); - } - - auto const& lhs_string = lhs.string(); - auto const& rhs_string = rhs.string(); - StringBuilder builder(lhs_string.length() + rhs_string.length()); - - auto return_combined_strings = [&]() { - builder.append(lhs_string); - builder.append(rhs_string); - return js_string(vm, builder.to_string()); - }; - - // Surrogates encoded as UTF-8 are 3 bytes. - if ((lhs_string.length() < 3) || (rhs_string.length() < 3)) - return return_combined_strings(); - - auto lhs_leading_byte = static_cast<u8>(lhs_string[lhs_string.length() - 3]); - auto rhs_leading_byte = static_cast<u8>(rhs_string[0]); - - if ((lhs_leading_byte & 0xf0) != 0xe0) - return return_combined_strings(); - if ((rhs_leading_byte & 0xf0) != 0xe0) - return return_combined_strings(); - - auto high_surrogate = *Utf8View(lhs_string.substring_view(lhs_string.length() - 3)).begin(); - auto low_surrogate = *Utf8View(rhs_string).begin(); - - if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) - return return_combined_strings(); - - builder.append(lhs_string.substring_view(0, lhs_string.length() - 3)); - builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate)); - builder.append(rhs_string.substring_view(3)); - - return js_string(vm, builder.to_string()); -} - // 13.8.1 The Addition Operator ( + ), https://tc39.es/ecma262/#sec-addition-operator-plus ThrowCompletionOr<Value> add(GlobalObject& global_object, Value lhs, Value rhs) { @@ -1146,7 +1094,7 @@ ThrowCompletionOr<Value> add(GlobalObject& global_object, Value lhs, Value rhs) if (lhs_primitive.is_string() || rhs_primitive.is_string()) { auto lhs_string = TRY(lhs_primitive.to_primitive_string(global_object)); auto rhs_string = TRY(rhs_primitive.to_primitive_string(global_object)); - return concatenate_strings(global_object, *lhs_string, *rhs_string); + return js_rope_string(global_object.vm(), *lhs_string, *rhs_string); } auto lhs_numeric = TRY(lhs_primitive.to_numeric(global_object)); |