/*
 * Copyright (c) 2020, Andreas Kling
 * Copyright (c) 2022, Linus Groh
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

// NOTE(review): This copy of the file appears to have lost every angle-bracket
// span: the #include targets below are blank, and template argument lists are
// missing (e.g. after ThrowCompletionOr, NonnullGCPtr, Optional, Vector,
// static_cast and allocate_without_realm). Restore them from the original
// source before building - TODO confirm.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace JS {

// Constructs a rope: the string is flagged as a rope and only stores pointers
// to its two halves. No character data is stored by this constructor.
PrimitiveString::PrimitiveString(PrimitiveString& lhs, PrimitiveString& rhs)
    : m_is_rope(true)
    , m_lhs(&lhs)
    , m_rhs(&rhs)
{
}

// Constructs a string whose only initial representation is the given String
// (stored in m_utf8_string).
PrimitiveString::PrimitiveString(String string)
    : m_utf8_string(move(string))
{
}

// Constructs a string whose only initial representation is the given
// DeprecatedString (stored in m_deprecated_string).
PrimitiveString::PrimitiveString(DeprecatedString string)
    : m_deprecated_string(move(string))
{
}

// Constructs a string whose only initial representation is the given
// Utf16String (stored in m_utf16_string).
PrimitiveString::PrimitiveString(Utf16String string)
    : m_utf16_string(move(string))
{
}

// Removes this string's value from the VM caches: the String cache if a UTF-8
// representation is present, and the DeprecatedString cache if a
// DeprecatedString representation is present. Both checks are independent.
// NOTE(review): the removal is keyed by string value and happens even when the
// representation was produced lazily by a conversion rather than by a cached
// create() call - confirm that this is intended.
PrimitiveString::~PrimitiveString()
{
    if (has_utf8_string())
        vm().string_cache().remove(*m_utf8_string);
    if (has_deprecated_string())
        vm().deprecated_string_cache().remove(*m_deprecated_string);
}

// Visits the base Cell edges and, while this string is still an unresolved
// rope, both of its halves (presumably so the halves stay alive for as long as
// the rope references them - TODO confirm against the heap's visitor contract).
void PrimitiveString::visit_edges(Cell::Visitor& visitor)
{
    Cell::visit_edges(visitor);
    if (m_is_rope) {
        visitor.visit(m_lhs);
        visitor.visit(m_rhs);
    }
}

// Returns whether the string has no content, without resolving a rope.
// Otherwise the first representation that is present answers the question, in
// the order UTF-16, UTF-8, DeprecatedString. Having none of them is treated as
// unreachable.
bool PrimitiveString::is_empty() const
{
    if (m_is_rope) {
        // NOTE: We never make an empty rope string.
        return false;
    }

    if (has_utf16_string())
        return m_utf16_string->is_empty();
    if (has_utf8_string())
        return m_utf8_string->is_empty();
    if (has_deprecated_string())
        return m_deprecated_string->is_empty();
    VERIFY_NOT_REACHED();
}

// Returns the UTF-8 representation of this string.
// The rope is resolved first. If no UTF-8 representation exists yet, it is
// converted from the DeprecatedString representation if present, otherwise from
// the UTF-16 representation, and stored in m_utf8_string for later calls
// (assigned from a const method, so the member is presumably declared mutable).
// Failures from rope resolution or conversion are propagated to the caller.
ThrowCompletionOr PrimitiveString::utf8_string() const
{
    auto& vm = this->vm();
    TRY(resolve_rope_if_needed());

    if (!has_utf8_string()) {
        if (has_deprecated_string())
            m_utf8_string = TRY_OR_THROW_OOM(vm, String::from_utf8(*m_deprecated_string));
        else if (has_utf16_string())
            m_utf8_string = TRY(m_utf16_string->to_utf8(vm));
        else
            VERIFY_NOT_REACHED();
    }

    return *m_utf8_string;
}

// Returns a view over the bytes of the UTF-8 representation.
// utf8_string() is called only for its side effect of populating
// m_utf8_string; its return value is discarded and the view is taken from the
// stored member.
ThrowCompletionOr PrimitiveString::utf8_string_view() const
{
    (void)TRY(utf8_string());
    return m_utf8_string->bytes_as_string_view();
}

// Returns the DeprecatedString representation of this string.
// The rope is resolved first. If no DeprecatedString representation exists yet,
// it is converted from the UTF-8 representation if present, otherwise from the
// UTF-16 representation, and stored in m_deprecated_string for later calls.
// Only the rope resolution and the UTF-16 conversion are wrapped in TRY here.
ThrowCompletionOr PrimitiveString::deprecated_string() const
{
    TRY(resolve_rope_if_needed());

    if (!has_deprecated_string()) {
        if (has_utf8_string())
            m_deprecated_string = m_utf8_string->to_deprecated_string();
        else if (has_utf16_string())
            m_deprecated_string = TRY(m_utf16_string->to_deprecated_string(vm()));
        else
            VERIFY_NOT_REACHED();
    }

    return *m_deprecated_string;
}

// Returns the UTF-16 representation of this string.
// The rope is resolved first. If no UTF-16 representation exists yet, it is
// created from the UTF-8 bytes if present; otherwise a DeprecatedString
// representation must exist (checked with VERIFY) and is used instead. The
// result is stored in m_utf16_string for later calls.
ThrowCompletionOr PrimitiveString::utf16_string() const
{
    TRY(resolve_rope_if_needed());

    if (!has_utf16_string()) {
        if (has_utf8_string()) {
            m_utf16_string = TRY(Utf16String::create(vm(), m_utf8_string->bytes_as_string_view()));
        } else {
            VERIFY(has_deprecated_string());
            m_utf16_string = TRY(Utf16String::create(vm(), *m_deprecated_string));
        }
    }

    return *m_utf16_string;
}

// Returns a view over the UTF-16 representation.
// utf16_string() is called only for its side effect of populating
// m_utf16_string; its return value is discarded and the view is taken from the
// stored member.
ThrowCompletionOr PrimitiveString::utf16_string_view() const
{
    (void)TRY(utf16_string());
    return m_utf16_string->view();
}

// Looks up a property directly on the string primitive.
//  - Symbol keys yield an empty Optional.
//  - A string key equal to vm.names.length yields the length in UTF-16 code
//    units, wrapped in a Value.
//  - Any other key is passed through canonical_numeric_index_string(); if the
//    result is not an index, or the index is at or past the length in code
//    units, an empty Optional is returned.
//  - Otherwise a new string made of the single code unit at that index is
//    created and returned.
// Failures while producing the UTF-16 representation or the one-code-unit
// string are propagated to the caller.
ThrowCompletionOr> PrimitiveString::get(VM& vm, PropertyKey const& property_key) const
{
    if (property_key.is_symbol())
        return Optional {};
    if (property_key.is_string()) {
        if (property_key.as_string() == vm.names.length.as_string()) {
            auto length = TRY(utf16_string()).length_in_code_units();
            return Value(static_cast(length));
        }
    }
    auto index = canonical_numeric_index_string(property_key, CanonicalIndexMode::IgnoreNumericRoundtrip);
    if (!index.is_index())
        return Optional {};
    auto str = TRY(utf16_string_view());
    auto length = str.length_in_code_units();
    // Out-of-range indices are reported as "no such property" rather than an error.
    if (length <= index.as_index())
        return Optional {};
    return create(vm, TRY(Utf16String::create(vm, str.substring_view(index.as_index(), 1))));
}

// Creates a string from UTF-16 data.
// Empty input returns the VM's shared empty string, and a single ASCII code
// unit returns the VM's shared single-character string. Everything else
// allocates a new PrimitiveString; no cache is consulted in this overload.
NonnullGCPtr PrimitiveString::create(VM& vm, Utf16String string)
{
    if (string.is_empty())
        return vm.empty_string();

    if (string.length_in_code_units() == 1) {
        u16 code_unit = string.code_unit_at(0);
        if (is_ascii(code_unit))
            return vm.single_ascii_character_string(static_cast(code_unit));
    }

    return vm.heap().allocate_without_realm(move(string));
}

// Creates a string from a String.
// Empty input returns the VM's shared empty string, and a single ASCII byte
// returns the VM's shared single-character string. Otherwise vm.string_cache()
// is consulted: a hit returns the cached string, a miss allocates a new
// PrimitiveString and registers it in the cache under the same value.
NonnullGCPtr PrimitiveString::create(VM& vm, String string)
{
    if (string.is_empty())
        return vm.empty_string();

    if (auto bytes = string.bytes_as_string_view(); bytes.length() == 1) {
        auto ch = static_cast(bytes[0]);
        if (is_ascii(ch))
            return vm.single_ascii_character_string(ch);
    }

    auto& string_cache = vm.string_cache();
    if (auto it = string_cache.find(string); it != string_cache.end())
        return *it->value;

    // The string is copied into the new object first, then moved into the cache as the key.
    auto new_string = vm.heap().allocate_without_realm(string);
    string_cache.set(move(string), new_string);
    return *new_string;
}

// Creates a string from a DeprecatedString.
// Behaves like the String overload, but uses vm.deprecated_string_cache(): the
// shared empty / single-ASCII-character strings are returned where applicable,
// a cache hit returns the cached string, and a miss allocates a new
// PrimitiveString and registers it in the cache.
NonnullGCPtr PrimitiveString::create(VM& vm, DeprecatedString string)
{
    if (string.is_empty())
        return vm.empty_string();

    if (string.length() == 1) {
        auto ch = static_cast(string.characters()[0]);
        if (is_ascii(ch))
            return vm.single_ascii_character_string(ch);
    }

    auto& string_cache = vm.deprecated_string_cache();
    auto it = string_cache.find(string);
    if (it == string_cache.end()) {
        // The string is copied into the new object first, then moved into the cache as the key.
        auto new_string = vm.heap().allocate_without_realm(string);
        string_cache.set(move(string), new_string);
        return *new_string;
    }
    return *it->value;
}

// Creates the concatenation of lhs and rhs.
// If both are empty the VM's shared empty string is returned; if exactly one is
// empty the other operand is returned unchanged. Only when both are non-empty
// is a new rope allocated, which is what lets is_empty() report ropes as
// non-empty without resolving them.
NonnullGCPtr PrimitiveString::create(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs)
{
    // We're here to concatenate two strings into a new rope string.
    // However, if any of them are empty, no rope is required.

    bool lhs_empty = lhs.is_empty();
    bool rhs_empty = rhs.is_empty();

    if (lhs_empty && rhs_empty)
        return vm.empty_string();

    if (lhs_empty)
        return rhs;

    if (rhs_empty)
        return lhs;

    return vm.heap().allocate_without_realm(lhs, rhs);
}

// Flattens this string if it is a rope; does nothing otherwise.
// After a successful call m_is_rope is false, m_lhs and m_rhs are null, and the
// flattened text is stored either in m_utf16_string (fast path, when both
// direct halves already have a UTF-16 representation) or in m_utf8_string
// (general path). On the general path a high surrogate ending one piece and a
// low surrogate starting the next are merged into one code point.
// Allocation and conversion failures are propagated to the caller; the rope
// state is only cleared once the flattened string has been stored.
ThrowCompletionOr PrimitiveString::resolve_rope_if_needed() const
{
    if (!m_is_rope)
        return {};

    auto& vm = this->vm();

    // NOTE: Special case for two concatenated UTF-16 strings.
    //       This is here as an optimization, although I'm unsure how valuable it is.
    if (m_lhs->has_utf16_string() && m_rhs->has_utf16_string()) {
        auto const& lhs_string = m_lhs->m_utf16_string.value();
        auto const& rhs_string = m_rhs->m_utf16_string.value();

        // Reserve the full combined length up front so OOM is reported before any data is appended.
        Utf16Data combined;
        TRY_OR_THROW_OOM(vm, combined.try_ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units()));
        combined.extend(lhs_string.string());
        combined.extend(rhs_string.string());

        m_utf16_string = TRY(Utf16String::create(vm, move(combined)));
        m_is_rope = false;
        m_lhs = nullptr;
        m_rhs = nullptr;
        return {};
    }

    // This vector will hold all the pieces of the rope that need to be assembled
    // into the resolved string.
    Vector pieces;

    // NOTE: We traverse the rope tree without using recursion, since we'd run out of
    //       stack space quickly when handling a long sequence of unresolved concatenations.
    // The right half is pushed before the left half so that take_last() yields the
    // left half first; the non-rope pieces therefore end up in left-to-right order.
    Vector stack;
    TRY_OR_THROW_OOM(vm, stack.try_append(m_rhs));
    TRY_OR_THROW_OOM(vm, stack.try_append(m_lhs));
    while (!stack.is_empty()) {
        auto const* current = stack.take_last();
        if (current->m_is_rope) {
            TRY_OR_THROW_OOM(vm, stack.try_append(current->m_rhs));
            TRY_OR_THROW_OOM(vm, stack.try_append(current->m_lhs));
            continue;
        }
        TRY_OR_THROW_OOM(vm, pieces.try_append(current));
    }

    // Now that we have all the pieces, we can concatenate them using a StringBuilder.
    ThrowableStringBuilder builder(vm);

    // We keep track of the previous piece in order to handle surrogate pairs spread across two pieces.
    PrimitiveString const* previous = nullptr;
    for (auto const* current : pieces) {
        if (!previous) {
            // This is the very first piece, just append it and continue.
            TRY(builder.append(TRY(current->utf8_string())));
            previous = current;
            continue;
        }

        // Get the UTF-8 representations for both strings.
        auto current_string_as_utf8 = TRY(current->utf8_string_view());
        auto previous_string_as_utf8 = TRY(previous->utf8_string_view());

        // NOTE: Now we need to look at the end of the previous string and the start
        //       of the current string, to see if they should be combined into a surrogate.

        // Surrogates encoded as UTF-8 are 3 bytes.
        if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) {
            TRY(builder.append(current_string_as_utf8));
            previous = current;
            continue;
        }

        // Might the previous string end with a UTF-8 encoded surrogate?
        // (A top nibble of 0xE marks the lead byte of a 3-byte UTF-8 sequence.)
        if ((static_cast(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) {
            // If not, just append the current string and continue.
            TRY(builder.append(current_string_as_utf8));
            previous = current;
            continue;
        }

        // Might the current string begin with a UTF-8 encoded surrogate?
        if ((static_cast(current_string_as_utf8[0]) & 0xf0) != 0xe0) {
            // If not, just append the current string and continue.
            TRY(builder.append(current_string_as_utf8));
            previous = current;
            continue;
        }

        // Both boundary sequences are 3 bytes long; decode them and check that they
        // really are a high surrogate followed by a low surrogate.
        auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
        auto low_surrogate = *Utf8View(current_string_as_utf8).begin();

        if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
            TRY(builder.append(current_string_as_utf8));
            previous = current;
            continue;
        }

        // Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
        builder.trim(3);
        TRY(builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate)));

        // Append the remaining part of the current string.
        TRY(builder.append(current_string_as_utf8.substring_view(3)));
        previous = current;
    }

    // Store the flattened text and drop the rope state.
    m_utf8_string = TRY(builder.to_string());
    m_is_rope = false;
    m_lhs = nullptr;
    m_rhs = nullptr;
    return {};
}

}