LibJS: Implement string concatenation using ropes

Instead of concatenating string data every time you add two strings together in JavaScript, we now create a new PrimitiveString that points to the two concatenated strings instead. This turns concatenated strings into a tree structure that doesn't have to be serialized until someone wants the characters in the string. This *dramatically* reduces the peak memory footprint when running the SunSpider benchmark (from ~6G to ~1G on my machine). It's also significantly faster (1.39x) :^)
author: Andreas Kling <kling@serenityos.org> 2022-08-05 23:58:47 +0200
committer: Andreas Kling <kling@serenityos.org> 2022-08-06 00:29:15 +0200
commit: 64b29eb4596976138b2e4f67f8502419e584f1e4 (patch)
tree: e16a0965e0613b3247e64cb363e654847ef6a20e /Userland/Libraries/LibJS/Runtime
parent: cf62d08b2ad8180249f68cd71a16c437706d000c (diff)
download: serenity-64b29eb4596976138b2e4f67f8502419e584f1e4.zip
3 files changed, 172 insertions, 59 deletions
diff --git a/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp b/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp
index 5aa4f0e17f..5b20e6f341 100644
--- a/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp
+++ b/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp
@@ -15,15 +15,22 @@
 
 namespace JS {
 
+PrimitiveString::PrimitiveString(PrimitiveString& lhs, PrimitiveString& rhs)
+{
+    m_is_rope = true;
+    m_left = &lhs;
+    m_right = &rhs;
+}
+
 PrimitiveString::PrimitiveString(String string)
-    : m_utf8_string(move(string))
-    , m_has_utf8_string(true)
+    : m_has_utf8_string(true)
+    , m_utf8_string(move(string))
 {
 }
 
 PrimitiveString::PrimitiveString(Utf16String string)
-    : m_utf16_string(move(string))
-    , m_has_utf16_string(true)
+    : m_has_utf16_string(true)
+    , m_utf16_string(move(string))
 {
 }
 
@@ -32,8 +39,22 @@ PrimitiveString::~PrimitiveString()
     vm().string_cache().remove(m_utf8_string);
 }
 
+void PrimitiveString::visit_edges(Cell::Visitor& visitor)
+{
+    Cell::visit_edges(visitor);
+    if (m_is_rope) {
+        visitor.visit(m_left);
+        visitor.visit(m_right);
+    }
+}
+
 bool PrimitiveString::is_empty() const
 {
+    if (m_is_rope) {
+        // NOTE: We never make an empty rope string.
+        return false;
+    }
+
     if (m_has_utf16_string)
         return m_utf16_string.is_empty();
     if (m_has_utf8_string)
@@ -43,6 +64,7 @@ bool PrimitiveString::is_empty() const
 
 String const& PrimitiveString::string() const
 {
+    resolve_rope_if_needed();
     if (!m_has_utf8_string) {
         m_utf8_string = m_utf16_string.to_utf8();
         m_has_utf8_string = true;
@@ -52,6 +74,7 @@ String const& PrimitiveString::string() const
 
 Utf16String const& PrimitiveString::utf16_string() const
 {
+    resolve_rope_if_needed();
     if (!m_has_utf16_string) {
         m_utf16_string = Utf16String(m_utf8_string);
         m_has_utf16_string = true;
@@ -139,4 +162,135 @@ PrimitiveString* js_string(VM& vm, String string)
     return js_string(vm.heap(), move(string));
 }
 
+PrimitiveString* js_rope_string(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs)
+{
+    // We're here to concatenate two strings into a new rope string.
+    // However, if any of them are empty, no rope is required.
+
+    bool lhs_empty = lhs.is_empty();
+    bool rhs_empty = rhs.is_empty();
+
+    if (lhs_empty && rhs_empty)
+        return &vm.empty_string();
+
+    if (lhs_empty)
+        return &rhs;
+
+    if (rhs_empty)
+        return &lhs;
+
+    return vm.heap().allocate_without_global_object<PrimitiveString>(lhs, rhs);
+}
+
+void PrimitiveString::resolve_rope_if_needed() const
+{
+    if (!m_is_rope)
+        return;
+
+    // NOTE: Special case for two concatenated UTF-16 strings.
+    //       This is here as an optimization, although I'm unsure how valuable it is.
+    if (m_left->has_utf16_string() && m_right->has_utf16_string()) {
+        auto const& lhs_string = m_left->utf16_string();
+        auto const& rhs_string = m_right->utf16_string();
+
+        Vector<u16, 1> combined;
+        combined.ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units());
+        combined.extend(lhs_string.string());
+        combined.extend(rhs_string.string());
+
+        m_utf16_string = Utf16String(move(combined));
+        m_has_utf16_string = true;
+        m_is_rope = false;
+        m_left = nullptr;
+        m_right = nullptr;
+        return;
+    }
+
+    // This vector will hold all the pieces of the rope that need to be assembled
+    // into the resolved string.
+    Vector<PrimitiveString const*> pieces;
+
+    // NOTE: We traverse the rope tree without using recursion, since we'd run out of
+    //       stack space quickly when handling a long sequence of unresolved concatenations.
+    Vector<PrimitiveString const*> stack;
+    stack.append(m_right);
+    stack.append(m_left);
+    while (!stack.is_empty()) {
+        auto* current = stack.take_last();
+        if (current->m_is_rope) {
+            stack.append(current->m_right);
+            stack.append(current->m_left);
+            continue;
+        }
+        pieces.append(current);
+    }
+
+    // Now that we have all the pieces, we can concatenate them using a StringBuilder.
+    StringBuilder builder;
+
+    // We keep track of the previous piece in order to handle surrogate pairs spread across two pieces.
+    PrimitiveString const* previous = nullptr;
+    for (auto const* current : pieces) {
+        if (!previous) {
+            // This is the very first piece, just append it and continue.
+            builder.append(current->string());
+            previous = current;
+            continue;
+        }
+
+        // Get the UTF-8 representations for both strings.
+        auto const& previous_string_as_utf8 = previous->string();
+        auto const& current_string_as_utf8 = current->string();
+
+        // NOTE: Now we need to look at the end of the previous string and the start
+        //       of the current string, to see if they should be combined into a surrogate.
+
+        // Surrogates encoded as UTF-8 are 3 bytes.
+        if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) {
+            builder.append(current->string());
+            previous = current;
+            continue;
+        }
+
+        // Might the previous string end with a UTF-8 encoded surrogate?
+        if ((static_cast<u8>(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) {
+            // If not, just append the current string and continue.
+            builder.append(current->string());
+            previous = current;
+            continue;
+        }
+
+        // Might the current string begin with a UTF-8 encoded surrogate?
+        if ((static_cast<u8>(current_string_as_utf8[0]) & 0xf0) != 0xe0) {
+            // If not, just append the current string and continue.
+            builder.append(current->string());
+            previous = current;
+            continue;
+        }
+
+        auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
+        auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
+
+        if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
+            builder.append(current->string());
+            previous = current;
+            continue;
+        }
+
+        // Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
+        builder.trim(3);
+        builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
+
+        // Append the remaining part of the current string.
+        builder.append(current_string_as_utf8.substring_view(3));
+        previous = current;
+    }
+
+    m_utf8_string = builder.to_string();
+    m_has_utf8_string = true;
+    m_is_rope = false;
+    m_left = nullptr;
+    m_right = nullptr;
+}
+
 }
diff --git a/Userland/Libraries/LibJS/Runtime/PrimitiveString.h b/Userland/Libraries/LibJS/Runtime/PrimitiveString.h
index 1a4f4af0dc..41b79828c0 100644
--- a/Userland/Libraries/LibJS/Runtime/PrimitiveString.h
+++ b/Userland/Libraries/LibJS/Runtime/PrimitiveString.h
@@ -17,6 +17,7 @@ namespace JS {
 
 class PrimitiveString final : public Cell {
 public:
+    explicit PrimitiveString(PrimitiveString&, PrimitiveString&);
     explicit PrimitiveString(String);
     explicit PrimitiveString(Utf16String);
     virtual ~PrimitiveString();
@@ -37,12 +38,20 @@ public:
 
 private:
     virtual StringView class_name() const override { return "PrimitiveString"sv; }
+    virtual void visit_edges(Cell::Visitor&) override;
 
-    mutable String m_utf8_string;
+    void resolve_rope_if_needed() const;
+
+    mutable bool m_is_rope { false };
     mutable bool m_has_utf8_string { false };
+    mutable bool m_has_utf16_string { false };
+
+    mutable PrimitiveString* m_left { nullptr };
+    mutable PrimitiveString* m_right { nullptr };
+
+    mutable String m_utf8_string;
 
     mutable Utf16String m_utf16_string;
-    mutable bool m_has_utf16_string { false };
 };
 
 PrimitiveString* js_string(Heap&, Utf16View const&);
@@ -54,4 +63,6 @@ PrimitiveString* js_string(VM&, Utf16String);
 PrimitiveString* js_string(Heap&, String);
 PrimitiveString* js_string(VM&, String);
 
+PrimitiveString* js_rope_string(VM&, PrimitiveString&, PrimitiveString&);
+
 }
diff --git a/Userland/Libraries/LibJS/Runtime/Value.cpp b/Userland/Libraries/LibJS/Runtime/Value.cpp
index 34fb6c1395..f210760c6a 100644
--- a/Userland/Libraries/LibJS/Runtime/Value.cpp
+++ b/Userland/Libraries/LibJS/Runtime/Value.cpp
@@ -1074,58 +1074,6 @@ ThrowCompletionOr<Value> unsigned_right_shift(GlobalObject& global_object, Value
     return vm.throw_completion<TypeError>(global_object, ErrorType::BigIntBadOperator, "unsigned right-shift");
 }
 
-// https://tc39.es/ecma262/#string-concatenation
-static PrimitiveString* concatenate_strings(GlobalObject& global_object, PrimitiveString const& lhs, PrimitiveString const& rhs)
-{
-    auto& vm = global_object.vm();
-
-    if (lhs.has_utf16_string() && rhs.has_utf16_string()) {
-        auto const& lhs_string = lhs.utf16_string();
-        auto const& rhs_string = rhs.utf16_string();
-
-        Vector<u16, 1> combined;
-        combined.ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units());
-        combined.extend(lhs_string.string());
-        combined.extend(rhs_string.string());
-
-        return js_string(vm, Utf16String(move(combined)));
-    }
-
-    auto const& lhs_string = lhs.string();
-    auto const& rhs_string = rhs.string();
-    StringBuilder builder(lhs_string.length() + rhs_string.length());
-
-    auto return_combined_strings = [&]() {
-        builder.append(lhs_string);
-        builder.append(rhs_string);
-        return js_string(vm, builder.to_string());
-    };
-
-    // Surrogates encoded as UTF-8 are 3 bytes.
-    if ((lhs_string.length() < 3) || (rhs_string.length() < 3))
-        return return_combined_strings();
-
-    auto lhs_leading_byte = static_cast<u8>(lhs_string[lhs_string.length() - 3]);
-    auto rhs_leading_byte = static_cast<u8>(rhs_string[0]);
-
-    if ((lhs_leading_byte & 0xf0) != 0xe0)
-        return return_combined_strings();
-    if ((rhs_leading_byte & 0xf0) != 0xe0)
-        return return_combined_strings();
-
-    auto high_surrogate = *Utf8View(lhs_string.substring_view(lhs_string.length() - 3)).begin();
-    auto low_surrogate = *Utf8View(rhs_string).begin();
-
-    if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate))
-        return return_combined_strings();
-
-    builder.append(lhs_string.substring_view(0, lhs_string.length() - 3));
-    builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
-    builder.append(rhs_string.substring_view(3));
-
-    return js_string(vm, builder.to_string());
-}
-
 // 13.8.1 The Addition Operator ( + ), https://tc39.es/ecma262/#sec-addition-operator-plus
 ThrowCompletionOr<Value> add(GlobalObject& global_object, Value lhs, Value rhs)
 {
@@ -1146,7 +1094,7 @@ ThrowCompletionOr<Value> add(GlobalObject& global_object, Value lhs, Value rhs)
     if (lhs_primitive.is_string() || rhs_primitive.is_string()) {
         auto lhs_string = TRY(lhs_primitive.to_primitive_string(global_object));
         auto rhs_string = TRY(rhs_primitive.to_primitive_string(global_object));
-        return concatenate_strings(global_object, *lhs_string, *rhs_string);
+        return js_rope_string(global_object.vm(), *lhs_string, *rhs_string);
     }
 
     auto lhs_numeric = TRY(lhs_primitive.to_numeric(global_object));
author	Andreas Kling <kling@serenityos.org>	2022-08-05 23:58:47 +0200
committer	Andreas Kling <kling@serenityos.org>	2022-08-06 00:29:15 +0200
commit	64b29eb4596976138b2e4f67f8502419e584f1e4 (patch)
tree	e16a0965e0613b3247e64cb363e654847ef6a20e /Userland/Libraries/LibJS/Runtime
parent	cf62d08b2ad8180249f68cd71a16c437706d000c (diff)
download	serenity-64b29eb4596976138b2e4f67f8502419e584f1e4.zip