diff options
author | Andreas Kling <kling@serenityos.org> | 2022-12-01 13:27:43 +0100 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2022-12-06 15:21:26 +0100 |
commit | a3e82eaad3b808d96d5d83abab94ecb8124a49ef (patch) | |
tree | f7a45e5fff32e4fb9035502cb7610dc296d35036 /AK | |
parent | d50b9165cd43a3ef61c56de563815ba766b86a27 (diff) | |
download | serenity-a3e82eaad3b808d96d5d83abab94ecb8124a49ef.zip |
AK: Introduce the new String, replacement for DeprecatedString
DeprecatedString (formerly String) has been with us since the start,
and it has served us well. However, it has a number of shortcomings
that I'd like to address.
Some of these issues are hard if not impossible to solve incrementally
inside of DeprecatedString, so instead of doing that, let's build a new
String class and then incrementally move over to it instead.
Problems in DeprecatedString:
- It assumes string allocation never fails. This makes it impossible
to use in allocation-sensitive contexts, and is the reason we had to
ban DeprecatedString from the kernel entirely.
- The awkward null state. DeprecatedString can be null. It's different
from the empty state, although null strings are considered empty.
All code is immediately nicer when using Optional<DeprecatedString>
but DeprecatedString came before Optional, which is how we ended up
like this.
- The encoding of the underlying data is ambiguous. For the most part,
we use it as if it's always UTF-8, but there have been cases where
we pass around strings in other encodings (e.g ISO8859-1)
- operator[] and length() are used to iterate over DeprecatedString one
byte at a time. This is done all over the codebase, and will *not*
give the right results unless the string is all ASCII.
How we solve these issues in the new String:
- Functions that may allocate now return ErrorOr<String> so that ENOMEM
errors can be passed to the caller.
- String has no null state. Use Optional<String> when needed.
- String is always UTF-8. This is validated when constructing a String.
We may need to add a bypass for this in the future, for cases where
you have a known-good string, but for now: validate all the things!
- There is no operator[] or length(). You can get the underlying data
with bytes(), but for iterating over code points, you should be using
an UTF-8 iterator.
Furthermore, it has two nifty new features:
- String implements a small string optimization (SSO) for strings that
can fit entirely within a pointer. This means up to 3 bytes on 32-bit
platforms, and 7 bytes on 64-bit platforms. Such small strings will
not be heap-allocated.
- String can create substrings without making a deep copy of the
substring. Instead, the superstring gets +1 refcount from the
substring, and it acts like a view into the superstring. To make
substrings like this, use the substring_with_shared_superstring() API.
One caveat:
- String does not guarantee that the underlying data is null-terminated
like DeprecatedString does today. While this was nifty in a handful of
places where we were calling C functions, it did stand in the way of
shared-superstring substrings.
Diffstat (limited to 'AK')
-rw-r--r-- | AK/CMakeLists.txt | 1 | ||||
-rw-r--r-- | AK/Forward.h | 2 | ||||
-rw-r--r-- | AK/String.cpp | 337 | ||||
-rw-r--r-- | AK/String.h | 132 | ||||
-rw-r--r-- | AK/StringBuilder.cpp | 6 | ||||
-rw-r--r-- | AK/StringBuilder.h | 2 | ||||
-rw-r--r-- | AK/StringUtils.cpp | 32 | ||||
-rw-r--r-- | AK/StringUtils.h | 2 |
8 files changed, 513 insertions, 1 deletions
diff --git a/AK/CMakeLists.txt b/AK/CMakeLists.txt index 46ec2ee1a4..295e41a328 100644 --- a/AK/CMakeLists.txt +++ b/AK/CMakeLists.txt @@ -15,6 +15,7 @@ set(AK_SOURCES LexicalPath.cpp Random.cpp StackInfo.cpp + String.cpp StringBuilder.cpp StringFloatingPointConversions.cpp StringImpl.cpp diff --git a/AK/Forward.h b/AK/Forward.h index a981707994..31561e6de4 100644 --- a/AK/Forward.h +++ b/AK/Forward.h @@ -31,6 +31,7 @@ class StringView; class Time; class URL; class FlyString; +class String; class Utf16View; class Utf32View; class Utf8CodePointIterator; @@ -188,6 +189,7 @@ using AK::RefPtr; using AK::SinglyLinkedList; using AK::Span; using AK::StackInfo; +using AK::String; using AK::StringBuilder; using AK::StringImpl; using AK::StringView; diff --git a/AK/String.cpp b/AK/String.cpp new file mode 100644 index 0000000000..ac443a0ce5 --- /dev/null +++ b/AK/String.cpp @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2018-2022, Andreas Kling <kling@serenityos.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include <AK/Checked.h> +#include <AK/Format.h> +#include <AK/Memory.h> +#include <AK/String.h> +#include <AK/StringBuilder.h> +#include <AK/Utf8View.h> +#include <stdlib.h> + +namespace AK { + +namespace Detail { + +class StringData final : public RefCounted<StringData> { +public: + static ErrorOr<NonnullRefPtr<StringData>> create_uninitialized(size_t, u8*& buffer); + static ErrorOr<NonnullRefPtr<StringData>> create_substring(StringData const& superstring, size_t start, size_t byte_count); + static ErrorOr<NonnullRefPtr<StringData>> from_utf8(char const* utf8_bytes, size_t); + + struct SubstringData { + StringData const* superstring { nullptr }; + u32 start_offset { 0 }; + }; + + void operator delete(void* ptr); + + ~StringData(); + + SubstringData const& substring_data() const + { + return *reinterpret_cast<SubstringData const*>(m_bytes_or_substring_data); + } + + // NOTE: There is no guarantee about null-termination. + ReadonlyBytes bytes() const + { + if (m_substring) { + auto const& data = substring_data(); + return data.superstring->bytes().slice(data.start_offset, m_byte_count); + } + return { &m_bytes_or_substring_data[0], m_byte_count }; + } + + StringView bytes_as_string_view() const { return { bytes() }; } + + bool operator==(StringData const& other) const + { + return bytes_as_string_view() == other.bytes_as_string_view(); + } + + unsigned hash() const + { + if (!m_has_hash) + compute_hash(); + return m_hash; + } + +private: + explicit StringData(size_t byte_count); + StringData(StringData const& superstring, size_t start, size_t byte_count); + + void compute_hash() const; + + u32 m_byte_count { 0 }; + mutable unsigned m_hash { 0 }; + mutable bool m_has_hash { false }; + bool m_substring { false }; + + u8 m_bytes_or_substring_data[0]; +}; + +void StringData::operator delete(void* ptr) +{ + free(ptr); +} + +StringData::StringData(size_t byte_count) + : m_byte_count(byte_count) +{ +} + +StringData::StringData(StringData const& superstring, size_t start, size_t byte_count) + : m_byte_count(byte_count) + , m_substring(true) +{ + auto& data = const_cast<SubstringData&>(substring_data()); + data.start_offset = start; + data.superstring = &superstring; + superstring.ref(); +} + +StringData::~StringData() +{ + if (m_substring) + substring_data().superstring->unref(); +} + +constexpr size_t allocation_size_for_string_data(size_t length) +{ + return sizeof(StringData) + (sizeof(char) * length) + sizeof(char); +} + +ErrorOr<NonnullRefPtr<StringData>> StringData::create_uninitialized(size_t byte_count, u8*& buffer) +{ + VERIFY(byte_count); + void* slot = malloc(allocation_size_for_string_data(byte_count)); + if (!slot) { + return Error::from_errno(ENOMEM); + } + auto new_string_data = adopt_ref(*new (slot) StringData(byte_count)); + buffer = const_cast<u8*>(new_string_data->bytes().data()); + return new_string_data; +} + +ErrorOr<NonnullRefPtr<StringData>> StringData::from_utf8(char const* utf8_data, size_t byte_count) +{ + // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization. + VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT); + + Utf8View view(StringView(utf8_data, byte_count)); + if (!view.validate()) + return Error::from_string_literal("StringData::from_utf8: Input was not valid UTF-8"); + + VERIFY(utf8_data); + u8* buffer = nullptr; + auto new_string_data = TRY(create_uninitialized(byte_count, buffer)); + memcpy(buffer, utf8_data, byte_count * sizeof(char)); + return new_string_data; +} + +ErrorOr<NonnullRefPtr<StringData>> StringData::create_substring(StringData const& superstring, size_t start, size_t byte_count) +{ + // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization. + VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT); + + void* slot = malloc(sizeof(StringData) + sizeof(StringData::SubstringData)); + if (!slot) { + return Error::from_errno(ENOMEM); + } + return adopt_ref(*new (slot) StringData(superstring, start, byte_count)); +} + +void StringData::compute_hash() const +{ + auto bytes = this->bytes(); + if (bytes.size() == 0) + m_hash = 0; + else + m_hash = string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size()); + m_has_hash = true; +} + +} + +String::String(NonnullRefPtr<Detail::StringData> data) + : m_data(&data.leak_ref()) +{ +} + +String::String(ShortString short_string) + : m_short_string(short_string) +{ +} + +String::String(String const& other) + : m_data(other.m_data) +{ + if (!is_short_string()) + m_data->ref(); +} + +String::String(String&& other) + : m_data(exchange(other.m_data, nullptr)) +{ +} + +String& String::operator=(String&& other) +{ + m_data = exchange(other.m_data, nullptr); + return *this; +} + +String& String::operator=(String const& other) +{ + if (&other != this) { + m_data = other.m_data; + if (!is_short_string()) + m_data->ref(); + } + return *this; +} + +String::~String() +{ + if (!is_short_string() && m_data) + m_data->unref(); +} + +String::String() +{ + // This is an empty string, it's always short and zero-length. + m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG; +} + +ErrorOr<String> String::from_utf8(StringView view) +{ + if (view.length() <= MAX_SHORT_STRING_BYTE_COUNT) { + ShortString short_string; + if (!view.is_empty()) + memcpy(short_string.storage, view.characters_without_null_termination(), view.length()); + short_string.byte_count_and_short_string_flag = (view.length() << 1) | SHORT_STRING_FLAG; + return String { short_string }; + } + auto data = TRY(Detail::StringData::from_utf8(view.characters_without_null_termination(), view.length())); + return String { move(data) }; +} + +StringView String::bytes_as_string_view() const +{ + return StringView(bytes()); +} + +ReadonlyBytes String::bytes() const +{ + if (is_short_string()) + return m_short_string.bytes(); + return m_data->bytes(); +} + +bool String::is_empty() const +{ + return bytes().size() == 0; +} + +ErrorOr<String> String::vformatted(StringView fmtstr, TypeErasedFormatParams& params) +{ + StringBuilder builder; + TRY(vformat(builder, fmtstr, params)); + return builder.to_string(); +} + +bool String::operator==(String const& other) const +{ + if (is_short_string()) + return m_data == other.m_data; + return bytes_as_string_view() == other.bytes_as_string_view(); +} + +bool String::operator==(StringView other) const +{ + return bytes_as_string_view() == other; +} + +ErrorOr<String> String::substring_from_byte_offset(size_t start, size_t byte_count) const +{ + if (!byte_count) + return String {}; + return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count)); +} + +ErrorOr<String> String::substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const +{ + if (!byte_count) + return String {}; + if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT) + return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count)); + return String { TRY(Detail::StringData::create_substring(*m_data, start, byte_count)) }; +} + +bool String::operator==(char const* c_string) const +{ + return bytes_as_string_view() == c_string; +} + +u32 String::hash() const +{ + if (is_short_string()) { + auto bytes = this->bytes(); + return string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size()); + } + return m_data->hash(); +} + +Utf8View String::code_points() const +{ + return Utf8View(bytes_as_string_view()); +} + +ErrorOr<void> Formatter<String>::format(FormatBuilder& builder, String const& utf8_string) +{ + return Formatter<StringView>::format(builder, utf8_string.bytes_as_string_view()); +} + +ErrorOr<String> String::replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const +{ + return StringUtils::replace(*this, needle, replacement, replace_mode); +} + +bool String::is_short_string() const +{ + return reinterpret_cast<uintptr_t>(m_data) & SHORT_STRING_FLAG; +} + +ReadonlyBytes String::ShortString::bytes() const +{ + return { storage, byte_count() }; +} + +size_t String::ShortString::byte_count() const +{ + return byte_count_and_short_string_flag >> 1; +} + +unsigned Traits<String>::hash(String const& string) +{ + return string.hash(); +} + +DeprecatedString String::to_deprecated_string() const +{ + return DeprecatedString(bytes_as_string_view()); +} + +ErrorOr<String> String::from_deprecated_string(DeprecatedString const& deprecated_string) +{ + Utf8View view(deprecated_string); + if (!view.validate()) + return Error::from_string_literal("String::from_deprecated_string: Input was not valid UTF-8"); + return String::from_utf8(deprecated_string.view()); +} + +} diff --git a/AK/String.h b/AK/String.h new file mode 100644 index 0000000000..3ce07bdb89 --- /dev/null +++ b/AK/String.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2018-2022, Andreas Kling <kling@serenityos.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include <AK/Format.h> +#include <AK/Forward.h> +#include <AK/RefCounted.h> +#include <AK/Span.h> +#include <AK/StringView.h> +#include <AK/Traits.h> +#include <AK/Types.h> + +namespace AK { + +namespace Detail { +class StringData; +} + +// String is a strongly owned sequence of Unicode code points encoded as UTF-8. +// The data may or may not be heap-allocated, and may or may not be reference counted. +// There is no guarantee that the underlying bytes are null-terminated. +class String { +public: + // NOTE: For short strings, we avoid heap allocations by storing them in the data pointer slot. + static constexpr size_t MAX_SHORT_STRING_BYTE_COUNT = sizeof(Detail::StringData*) - 1; + + String(String const&); + String(String&&); + + String& operator=(String&&); + String& operator=(String const&); + + ~String(); + + // Creates an empty (zero-length) String. + String(); + + // Creates a new String from a sequence of UTF-8 encoded code points. + static ErrorOr<String> from_utf8(StringView); + + // Creates a substring with a deep copy of the specified data window. + ErrorOr<String> substring_from_byte_offset(size_t start, size_t byte_count) const; + + // Creates a substring that strongly references the origin superstring instead of making a deep copy of the data. + ErrorOr<String> substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const; + + // Returns an iterable view over the Unicode code points. + [[nodiscard]] Utf8View code_points() const; + + // Returns the underlying UTF-8 encoded bytes. + // NOTE: There is no guarantee about null-termination. + [[nodiscard]] ReadonlyBytes bytes() const; + + // Returns true if the String is zero-length. + [[nodiscard]] bool is_empty() const; + + // Returns a StringView covering the full length of the string. Note that iterating this will go byte-at-a-time, not code-point-at-a-time. + [[nodiscard]] StringView bytes_as_string_view() const; + + ErrorOr<String> replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const; + + [[nodiscard]] bool operator==(String const&) const; + [[nodiscard]] bool operator!=(String const& other) const { return !(*this == other); } + + [[nodiscard]] bool operator==(StringView) const; + [[nodiscard]] bool operator!=(StringView other) const { return !(*this == other); } + + [[nodiscard]] bool operator==(char const* cstring) const; + [[nodiscard]] bool operator!=(char const* cstring) const { return !(*this == cstring); } + + [[nodiscard]] u32 hash() const; + + template<typename T> + static ErrorOr<String> number(T value) + requires IsArithmetic<T> + { + return formatted("{}", value); + } + + static ErrorOr<String> vformatted(StringView fmtstr, TypeErasedFormatParams&); + + template<typename... Parameters> + static ErrorOr<String> formatted(CheckedFormatString<Parameters...>&& fmtstr, Parameters const&... parameters) + { + VariadicFormatParams variadic_format_parameters { parameters... }; + return vformatted(fmtstr.view(), variadic_format_parameters); + } + + // NOTE: This is primarily interesting to unit tests. + [[nodiscard]] bool is_short_string() const; + + // FIXME: Remove these once all code has been ported to String + [[nodiscard]] DeprecatedString to_deprecated_string() const; + static ErrorOr<String> from_deprecated_string(DeprecatedString const&); + +private: + // NOTE: If the least significant bit of the pointer is set, this is a short string. + static constexpr uintptr_t SHORT_STRING_FLAG = 1; + + struct ShortString { + ReadonlyBytes bytes() const; + size_t byte_count() const; + + // NOTE: This is the byte count shifted left 1 step and or'ed with a 1 (the SHORT_STRING_FLAG) + u8 byte_count_and_short_string_flag { 0 }; + u8 storage[MAX_SHORT_STRING_BYTE_COUNT] = { 0 }; + }; + + explicit String(NonnullRefPtr<Detail::StringData>); + explicit String(ShortString); + + union { + ShortString m_short_string; + Detail::StringData* m_data { nullptr }; + }; +}; + +template<> +struct Traits<String> : public GenericTraits<String> { + static unsigned hash(String const&); +}; + +template<> +struct Formatter<String> : Formatter<StringView> { + ErrorOr<void> format(FormatBuilder&, String const&); +}; + +} diff --git a/AK/StringBuilder.cpp b/AK/StringBuilder.cpp index 9db7c9f23e..932699e208 100644 --- a/AK/StringBuilder.cpp +++ b/AK/StringBuilder.cpp @@ -8,6 +8,7 @@ #include <AK/Checked.h> #include <AK/PrintfImplementation.h> #include <AK/StdLibExtras.h> +#include <AK/String.h> #include <AK/StringBuilder.h> #include <AK/StringView.h> #include <AK/UnicodeUtils.h> @@ -115,6 +116,11 @@ DeprecatedString StringBuilder::build() const { return to_deprecated_string(); } + +ErrorOr<String> StringBuilder::to_string() const +{ + return String::from_utf8(string_view()); +} #endif StringView StringBuilder::string_view() const diff --git a/AK/StringBuilder.h b/AK/StringBuilder.h index e5c62cb4d0..9bde5b102d 100644 --- a/AK/StringBuilder.h +++ b/AK/StringBuilder.h @@ -62,7 +62,9 @@ public: #ifndef KERNEL [[nodiscard]] DeprecatedString build() const; [[nodiscard]] DeprecatedString to_deprecated_string() const; + ErrorOr<String> to_string() const; #endif + [[nodiscard]] ByteBuffer to_byte_buffer() const; [[nodiscard]] StringView string_view() const; diff --git a/AK/StringUtils.cpp b/AK/StringUtils.cpp index f72d3ec6ea..58157d148e 100644 --- a/AK/StringUtils.cpp +++ b/AK/StringUtils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, Andreas Kling <awesomekling@gmail.com> + * Copyright (c) 2018-2022, Andreas Kling <awesomekling@gmail.com> * Copyright (c) 2020, Fei Wu <f.eiwu@yahoo.com> * * SPDX-License-Identifier: BSD-2-Clause @@ -9,6 +9,7 @@ #include <AK/MemMem.h> #include <AK/Memory.h> #include <AK/Optional.h> +#include <AK/String.h> #include <AK/StringBuilder.h> #include <AK/StringUtils.h> #include <AK/StringView.h> @@ -533,6 +534,35 @@ DeprecatedString replace(StringView str, StringView needle, StringView replaceme replaced_string.append(str.substring_view(last_position, str.length() - last_position)); return replaced_string.build(); } + +ErrorOr<String> replace(String const& haystack, StringView needle, StringView replacement, ReplaceMode replace_mode) +{ + if (haystack.is_empty()) + return haystack; + + // FIXME: Propagate Vector allocation failures (or do this without putting positions in a vector) + Vector<size_t> positions; + if (replace_mode == ReplaceMode::All) { + positions = haystack.bytes_as_string_view().find_all(needle); + if (!positions.size()) + return haystack; + } else { + auto pos = haystack.bytes_as_string_view().find(needle); + if (!pos.has_value()) + return haystack; + positions.append(pos.value()); + } + + StringBuilder replaced_string; + size_t last_position = 0; + for (auto& position : positions) { + replaced_string.append(haystack.bytes_as_string_view().substring_view(last_position, position - last_position)); + replaced_string.append(replacement); + last_position = position + needle.length(); + } + replaced_string.append(haystack.bytes_as_string_view().substring_view(last_position, haystack.bytes_as_string_view().length() - last_position)); + return replaced_string.to_string(); +} #endif // TODO: Benchmark against KMP (AK/MemMem.h) and switch over if it's faster for short strings too diff --git a/AK/StringUtils.h b/AK/StringUtils.h index a531380323..173d8c941b 100644 --- a/AK/StringUtils.h +++ b/AK/StringUtils.h @@ -103,6 +103,8 @@ DeprecatedString to_titlecase(StringView); DeprecatedString invert_case(StringView); DeprecatedString replace(StringView, StringView needle, StringView replacement, ReplaceMode); +ErrorOr<String> replace(String const&, StringView needle, StringView replacement, ReplaceMode); + size_t count(StringView, StringView needle); } |