summaryrefslogtreecommitdiff
path: root/AK
diff options
context:
space:
mode:
authorAndreas Kling <kling@serenityos.org>2022-12-01 13:27:43 +0100
committerAndreas Kling <kling@serenityos.org>2022-12-06 15:21:26 +0100
commita3e82eaad3b808d96d5d83abab94ecb8124a49ef (patch)
treef7a45e5fff32e4fb9035502cb7610dc296d35036 /AK
parentd50b9165cd43a3ef61c56de563815ba766b86a27 (diff)
downloadserenity-a3e82eaad3b808d96d5d83abab94ecb8124a49ef.zip
AK: Introduce the new String, replacement for DeprecatedString
DeprecatedString (formerly String) has been with us since the start, and it has served us well. However, it has a number of shortcomings that I'd like to address. Some of these issues are hard if not impossible to solve incrementally inside of DeprecatedString, so instead of doing that, let's build a new String class and then incrementally move over to it instead. Problems in DeprecatedString: - It assumes string allocation never fails. This makes it impossible to use in allocation-sensitive contexts, and is the reason we had to ban DeprecatedString from the kernel entirely. - The awkward null state. DeprecatedString can be null. It's different from the empty state, although null strings are considered empty. All code is immediately nicer when using Optional<DeprecatedString> but DeprecatedString came before Optional, which is how we ended up like this. - The encoding of the underlying data is ambiguous. For the most part, we use it as if it's always UTF-8, but there have been cases where we pass around strings in other encodings (e.g ISO8859-1) - operator[] and length() are used to iterate over DeprecatedString one byte at a time. This is done all over the codebase, and will *not* give the right results unless the string is all ASCII. How we solve these issues in the new String: - Functions that may allocate now return ErrorOr<String> so that ENOMEM errors can be passed to the caller. - String has no null state. Use Optional<String> when needed. - String is always UTF-8. This is validated when constructing a String. We may need to add a bypass for this in the future, for cases where you have a known-good string, but for now: validate all the things! - There is no operator[] or length(). You can get the underlying data with bytes(), but for iterating over code points, you should be using an UTF-8 iterator. Furthermore, it has two nifty new features: - String implements a small string optimization (SSO) for strings that can fit entirely within a pointer. This means up to 3 bytes on 32-bit platforms, and 7 bytes on 64-bit platforms. Such small strings will not be heap-allocated. - String can create substrings without making a deep copy of the substring. Instead, the superstring gets +1 refcount from the substring, and it acts like a view into the superstring. To make substrings like this, use the substring_with_shared_superstring() API. One caveat: - String does not guarantee that the underlying data is null-terminated like DeprecatedString does today. While this was nifty in a handful of places where we were calling C functions, it did stand in the way of shared-superstring substrings.
Diffstat (limited to 'AK')
-rw-r--r--AK/CMakeLists.txt1
-rw-r--r--AK/Forward.h2
-rw-r--r--AK/String.cpp337
-rw-r--r--AK/String.h132
-rw-r--r--AK/StringBuilder.cpp6
-rw-r--r--AK/StringBuilder.h2
-rw-r--r--AK/StringUtils.cpp32
-rw-r--r--AK/StringUtils.h2
8 files changed, 513 insertions, 1 deletions
diff --git a/AK/CMakeLists.txt b/AK/CMakeLists.txt
index 46ec2ee1a4..295e41a328 100644
--- a/AK/CMakeLists.txt
+++ b/AK/CMakeLists.txt
@@ -15,6 +15,7 @@ set(AK_SOURCES
LexicalPath.cpp
Random.cpp
StackInfo.cpp
+ String.cpp
StringBuilder.cpp
StringFloatingPointConversions.cpp
StringImpl.cpp
diff --git a/AK/Forward.h b/AK/Forward.h
index a981707994..31561e6de4 100644
--- a/AK/Forward.h
+++ b/AK/Forward.h
@@ -31,6 +31,7 @@ class StringView;
class Time;
class URL;
class FlyString;
+class String;
class Utf16View;
class Utf32View;
class Utf8CodePointIterator;
@@ -188,6 +189,7 @@ using AK::RefPtr;
using AK::SinglyLinkedList;
using AK::Span;
using AK::StackInfo;
+using AK::String;
using AK::StringBuilder;
using AK::StringImpl;
using AK::StringView;
diff --git a/AK/String.cpp b/AK/String.cpp
new file mode 100644
index 0000000000..ac443a0ce5
--- /dev/null
+++ b/AK/String.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2018-2022, Andreas Kling <kling@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Checked.h>
+#include <AK/Format.h>
+#include <AK/Memory.h>
+#include <AK/String.h>
+#include <AK/StringBuilder.h>
+#include <AK/Utf8View.h>
+#include <stdlib.h>
+
+namespace AK {
+
+namespace Detail {
+
+class StringData final : public RefCounted<StringData> {
+public:
+ static ErrorOr<NonnullRefPtr<StringData>> create_uninitialized(size_t, u8*& buffer);
+ static ErrorOr<NonnullRefPtr<StringData>> create_substring(StringData const& superstring, size_t start, size_t byte_count);
+ static ErrorOr<NonnullRefPtr<StringData>> from_utf8(char const* utf8_bytes, size_t);
+
+ struct SubstringData {
+ StringData const* superstring { nullptr };
+ u32 start_offset { 0 };
+ };
+
+ void operator delete(void* ptr);
+
+ ~StringData();
+
+ SubstringData const& substring_data() const
+ {
+ return *reinterpret_cast<SubstringData const*>(m_bytes_or_substring_data);
+ }
+
+ // NOTE: There is no guarantee about null-termination.
+ ReadonlyBytes bytes() const
+ {
+ if (m_substring) {
+ auto const& data = substring_data();
+ return data.superstring->bytes().slice(data.start_offset, m_byte_count);
+ }
+ return { &m_bytes_or_substring_data[0], m_byte_count };
+ }
+
+ StringView bytes_as_string_view() const { return { bytes() }; }
+
+ bool operator==(StringData const& other) const
+ {
+ return bytes_as_string_view() == other.bytes_as_string_view();
+ }
+
+ unsigned hash() const
+ {
+ if (!m_has_hash)
+ compute_hash();
+ return m_hash;
+ }
+
+private:
+ explicit StringData(size_t byte_count);
+ StringData(StringData const& superstring, size_t start, size_t byte_count);
+
+ void compute_hash() const;
+
+ u32 m_byte_count { 0 };
+ mutable unsigned m_hash { 0 };
+ mutable bool m_has_hash { false };
+ bool m_substring { false };
+
+ u8 m_bytes_or_substring_data[0];
+};
+
+void StringData::operator delete(void* ptr)
+{
+ free(ptr);
+}
+
+StringData::StringData(size_t byte_count)
+ : m_byte_count(byte_count)
+{
+}
+
+StringData::StringData(StringData const& superstring, size_t start, size_t byte_count)
+ : m_byte_count(byte_count)
+ , m_substring(true)
+{
+ auto& data = const_cast<SubstringData&>(substring_data());
+ data.start_offset = start;
+ data.superstring = &superstring;
+ superstring.ref();
+}
+
+StringData::~StringData()
+{
+ if (m_substring)
+ substring_data().superstring->unref();
+}
+
+constexpr size_t allocation_size_for_string_data(size_t length)
+{
+ return sizeof(StringData) + (sizeof(char) * length) + sizeof(char);
+}
+
+ErrorOr<NonnullRefPtr<StringData>> StringData::create_uninitialized(size_t byte_count, u8*& buffer)
+{
+ VERIFY(byte_count);
+ void* slot = malloc(allocation_size_for_string_data(byte_count));
+ if (!slot) {
+ return Error::from_errno(ENOMEM);
+ }
+ auto new_string_data = adopt_ref(*new (slot) StringData(byte_count));
+ buffer = const_cast<u8*>(new_string_data->bytes().data());
+ return new_string_data;
+}
+
+ErrorOr<NonnullRefPtr<StringData>> StringData::from_utf8(char const* utf8_data, size_t byte_count)
+{
+ // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
+ VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT);
+
+ Utf8View view(StringView(utf8_data, byte_count));
+ if (!view.validate())
+ return Error::from_string_literal("StringData::from_utf8: Input was not valid UTF-8");
+
+ VERIFY(utf8_data);
+ u8* buffer = nullptr;
+ auto new_string_data = TRY(create_uninitialized(byte_count, buffer));
+ memcpy(buffer, utf8_data, byte_count * sizeof(char));
+ return new_string_data;
+}
+
+ErrorOr<NonnullRefPtr<StringData>> StringData::create_substring(StringData const& superstring, size_t start, size_t byte_count)
+{
+ // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
+ VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT);
+
+ void* slot = malloc(sizeof(StringData) + sizeof(StringData::SubstringData));
+ if (!slot) {
+ return Error::from_errno(ENOMEM);
+ }
+ return adopt_ref(*new (slot) StringData(superstring, start, byte_count));
+}
+
+void StringData::compute_hash() const
+{
+ auto bytes = this->bytes();
+ if (bytes.size() == 0)
+ m_hash = 0;
+ else
+ m_hash = string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
+ m_has_hash = true;
+}
+
+}
+
+String::String(NonnullRefPtr<Detail::StringData> data)
+ : m_data(&data.leak_ref())
+{
+}
+
+String::String(ShortString short_string)
+ : m_short_string(short_string)
+{
+}
+
+String::String(String const& other)
+ : m_data(other.m_data)
+{
+ if (!is_short_string())
+ m_data->ref();
+}
+
+String::String(String&& other)
+ : m_data(exchange(other.m_data, nullptr))
+{
+}
+
+String& String::operator=(String&& other)
+{
+ m_data = exchange(other.m_data, nullptr);
+ return *this;
+}
+
+String& String::operator=(String const& other)
+{
+ if (&other != this) {
+ m_data = other.m_data;
+ if (!is_short_string())
+ m_data->ref();
+ }
+ return *this;
+}
+
+String::~String()
+{
+ if (!is_short_string() && m_data)
+ m_data->unref();
+}
+
+String::String()
+{
+ // This is an empty string, it's always short and zero-length.
+ m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG;
+}
+
+ErrorOr<String> String::from_utf8(StringView view)
+{
+ if (view.length() <= MAX_SHORT_STRING_BYTE_COUNT) {
+ ShortString short_string;
+ if (!view.is_empty())
+ memcpy(short_string.storage, view.characters_without_null_termination(), view.length());
+ short_string.byte_count_and_short_string_flag = (view.length() << 1) | SHORT_STRING_FLAG;
+ return String { short_string };
+ }
+ auto data = TRY(Detail::StringData::from_utf8(view.characters_without_null_termination(), view.length()));
+ return String { move(data) };
+}
+
+StringView String::bytes_as_string_view() const
+{
+ return StringView(bytes());
+}
+
+ReadonlyBytes String::bytes() const
+{
+ if (is_short_string())
+ return m_short_string.bytes();
+ return m_data->bytes();
+}
+
+bool String::is_empty() const
+{
+ return bytes().size() == 0;
+}
+
+ErrorOr<String> String::vformatted(StringView fmtstr, TypeErasedFormatParams& params)
+{
+ StringBuilder builder;
+ TRY(vformat(builder, fmtstr, params));
+ return builder.to_string();
+}
+
+bool String::operator==(String const& other) const
+{
+ if (is_short_string())
+ return m_data == other.m_data;
+ return bytes_as_string_view() == other.bytes_as_string_view();
+}
+
+bool String::operator==(StringView other) const
+{
+ return bytes_as_string_view() == other;
+}
+
+ErrorOr<String> String::substring_from_byte_offset(size_t start, size_t byte_count) const
+{
+ if (!byte_count)
+ return String {};
+ return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count));
+}
+
+ErrorOr<String> String::substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const
+{
+ if (!byte_count)
+ return String {};
+ if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT)
+ return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count));
+ return String { TRY(Detail::StringData::create_substring(*m_data, start, byte_count)) };
+}
+
+bool String::operator==(char const* c_string) const
+{
+ return bytes_as_string_view() == c_string;
+}
+
+u32 String::hash() const
+{
+ if (is_short_string()) {
+ auto bytes = this->bytes();
+ return string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
+ }
+ return m_data->hash();
+}
+
+Utf8View String::code_points() const
+{
+ return Utf8View(bytes_as_string_view());
+}
+
+ErrorOr<void> Formatter<String>::format(FormatBuilder& builder, String const& utf8_string)
+{
+ return Formatter<StringView>::format(builder, utf8_string.bytes_as_string_view());
+}
+
+ErrorOr<String> String::replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const
+{
+ return StringUtils::replace(*this, needle, replacement, replace_mode);
+}
+
+bool String::is_short_string() const
+{
+ return reinterpret_cast<uintptr_t>(m_data) & SHORT_STRING_FLAG;
+}
+
+ReadonlyBytes String::ShortString::bytes() const
+{
+ return { storage, byte_count() };
+}
+
+size_t String::ShortString::byte_count() const
+{
+ return byte_count_and_short_string_flag >> 1;
+}
+
+unsigned Traits<String>::hash(String const& string)
+{
+ return string.hash();
+}
+
+DeprecatedString String::to_deprecated_string() const
+{
+ return DeprecatedString(bytes_as_string_view());
+}
+
+ErrorOr<String> String::from_deprecated_string(DeprecatedString const& deprecated_string)
+{
+ Utf8View view(deprecated_string);
+ if (!view.validate())
+ return Error::from_string_literal("String::from_deprecated_string: Input was not valid UTF-8");
+ return String::from_utf8(deprecated_string.view());
+}
+
+}
diff --git a/AK/String.h b/AK/String.h
new file mode 100644
index 0000000000..3ce07bdb89
--- /dev/null
+++ b/AK/String.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2018-2022, Andreas Kling <kling@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Format.h>
+#include <AK/Forward.h>
+#include <AK/RefCounted.h>
+#include <AK/Span.h>
+#include <AK/StringView.h>
+#include <AK/Traits.h>
+#include <AK/Types.h>
+
+namespace AK {
+
+namespace Detail {
+class StringData;
+}
+
+// String is a strongly owned sequence of Unicode code points encoded as UTF-8.
+// The data may or may not be heap-allocated, and may or may not be reference counted.
+// There is no guarantee that the underlying bytes are null-terminated.
+class String {
+public:
+ // NOTE: For short strings, we avoid heap allocations by storing them in the data pointer slot.
+ static constexpr size_t MAX_SHORT_STRING_BYTE_COUNT = sizeof(Detail::StringData*) - 1;
+
+ String(String const&);
+ String(String&&);
+
+ String& operator=(String&&);
+ String& operator=(String const&);
+
+ ~String();
+
+ // Creates an empty (zero-length) String.
+ String();
+
+ // Creates a new String from a sequence of UTF-8 encoded code points.
+ static ErrorOr<String> from_utf8(StringView);
+
+ // Creates a substring with a deep copy of the specified data window.
+ ErrorOr<String> substring_from_byte_offset(size_t start, size_t byte_count) const;
+
+ // Creates a substring that strongly references the origin superstring instead of making a deep copy of the data.
+ ErrorOr<String> substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const;
+
+ // Returns an iterable view over the Unicode code points.
+ [[nodiscard]] Utf8View code_points() const;
+
+ // Returns the underlying UTF-8 encoded bytes.
+ // NOTE: There is no guarantee about null-termination.
+ [[nodiscard]] ReadonlyBytes bytes() const;
+
+ // Returns true if the String is zero-length.
+ [[nodiscard]] bool is_empty() const;
+
+ // Returns a StringView covering the full length of the string. Note that iterating this will go byte-at-a-time, not code-point-at-a-time.
+ [[nodiscard]] StringView bytes_as_string_view() const;
+
+ ErrorOr<String> replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const;
+
+ [[nodiscard]] bool operator==(String const&) const;
+ [[nodiscard]] bool operator!=(String const& other) const { return !(*this == other); }
+
+ [[nodiscard]] bool operator==(StringView) const;
+ [[nodiscard]] bool operator!=(StringView other) const { return !(*this == other); }
+
+ [[nodiscard]] bool operator==(char const* cstring) const;
+ [[nodiscard]] bool operator!=(char const* cstring) const { return !(*this == cstring); }
+
+ [[nodiscard]] u32 hash() const;
+
+ template<typename T>
+ static ErrorOr<String> number(T value)
+ requires IsArithmetic<T>
+ {
+ return formatted("{}", value);
+ }
+
+ static ErrorOr<String> vformatted(StringView fmtstr, TypeErasedFormatParams&);
+
+ template<typename... Parameters>
+ static ErrorOr<String> formatted(CheckedFormatString<Parameters...>&& fmtstr, Parameters const&... parameters)
+ {
+ VariadicFormatParams variadic_format_parameters { parameters... };
+ return vformatted(fmtstr.view(), variadic_format_parameters);
+ }
+
+ // NOTE: This is primarily interesting to unit tests.
+ [[nodiscard]] bool is_short_string() const;
+
+ // FIXME: Remove these once all code has been ported to String
+ [[nodiscard]] DeprecatedString to_deprecated_string() const;
+ static ErrorOr<String> from_deprecated_string(DeprecatedString const&);
+
+private:
+ // NOTE: If the least significant bit of the pointer is set, this is a short string.
+ static constexpr uintptr_t SHORT_STRING_FLAG = 1;
+
+ struct ShortString {
+ ReadonlyBytes bytes() const;
+ size_t byte_count() const;
+
+ // NOTE: This is the byte count shifted left 1 step and or'ed with a 1 (the SHORT_STRING_FLAG)
+ u8 byte_count_and_short_string_flag { 0 };
+ u8 storage[MAX_SHORT_STRING_BYTE_COUNT] = { 0 };
+ };
+
+ explicit String(NonnullRefPtr<Detail::StringData>);
+ explicit String(ShortString);
+
+ union {
+ ShortString m_short_string;
+ Detail::StringData* m_data { nullptr };
+ };
+};
+
+template<>
+struct Traits<String> : public GenericTraits<String> {
+ static unsigned hash(String const&);
+};
+
+template<>
+struct Formatter<String> : Formatter<StringView> {
+ ErrorOr<void> format(FormatBuilder&, String const&);
+};
+
+}
diff --git a/AK/StringBuilder.cpp b/AK/StringBuilder.cpp
index 9db7c9f23e..932699e208 100644
--- a/AK/StringBuilder.cpp
+++ b/AK/StringBuilder.cpp
@@ -8,6 +8,7 @@
#include <AK/Checked.h>
#include <AK/PrintfImplementation.h>
#include <AK/StdLibExtras.h>
+#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/StringView.h>
#include <AK/UnicodeUtils.h>
@@ -115,6 +116,11 @@ DeprecatedString StringBuilder::build() const
{
return to_deprecated_string();
}
+
+ErrorOr<String> StringBuilder::to_string() const
+{
+ return String::from_utf8(string_view());
+}
#endif
StringView StringBuilder::string_view() const
diff --git a/AK/StringBuilder.h b/AK/StringBuilder.h
index e5c62cb4d0..9bde5b102d 100644
--- a/AK/StringBuilder.h
+++ b/AK/StringBuilder.h
@@ -62,7 +62,9 @@ public:
#ifndef KERNEL
[[nodiscard]] DeprecatedString build() const;
[[nodiscard]] DeprecatedString to_deprecated_string() const;
+ ErrorOr<String> to_string() const;
#endif
+
[[nodiscard]] ByteBuffer to_byte_buffer() const;
[[nodiscard]] StringView string_view() const;
diff --git a/AK/StringUtils.cpp b/AK/StringUtils.cpp
index f72d3ec6ea..58157d148e 100644
--- a/AK/StringUtils.cpp
+++ b/AK/StringUtils.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020, Andreas Kling <awesomekling@gmail.com>
+ * Copyright (c) 2018-2022, Andreas Kling <awesomekling@gmail.com>
* Copyright (c) 2020, Fei Wu <f.eiwu@yahoo.com>
*
* SPDX-License-Identifier: BSD-2-Clause
@@ -9,6 +9,7 @@
#include <AK/MemMem.h>
#include <AK/Memory.h>
#include <AK/Optional.h>
+#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/StringUtils.h>
#include <AK/StringView.h>
@@ -533,6 +534,35 @@ DeprecatedString replace(StringView str, StringView needle, StringView replaceme
replaced_string.append(str.substring_view(last_position, str.length() - last_position));
return replaced_string.build();
}
+
+ErrorOr<String> replace(String const& haystack, StringView needle, StringView replacement, ReplaceMode replace_mode)
+{
+ if (haystack.is_empty())
+ return haystack;
+
+ // FIXME: Propagate Vector allocation failures (or do this without putting positions in a vector)
+ Vector<size_t> positions;
+ if (replace_mode == ReplaceMode::All) {
+ positions = haystack.bytes_as_string_view().find_all(needle);
+ if (!positions.size())
+ return haystack;
+ } else {
+ auto pos = haystack.bytes_as_string_view().find(needle);
+ if (!pos.has_value())
+ return haystack;
+ positions.append(pos.value());
+ }
+
+ StringBuilder replaced_string;
+ size_t last_position = 0;
+ for (auto& position : positions) {
+ replaced_string.append(haystack.bytes_as_string_view().substring_view(last_position, position - last_position));
+ replaced_string.append(replacement);
+ last_position = position + needle.length();
+ }
+ replaced_string.append(haystack.bytes_as_string_view().substring_view(last_position, haystack.bytes_as_string_view().length() - last_position));
+ return replaced_string.to_string();
+}
#endif
// TODO: Benchmark against KMP (AK/MemMem.h) and switch over if it's faster for short strings too
diff --git a/AK/StringUtils.h b/AK/StringUtils.h
index a531380323..173d8c941b 100644
--- a/AK/StringUtils.h
+++ b/AK/StringUtils.h
@@ -103,6 +103,8 @@ DeprecatedString to_titlecase(StringView);
DeprecatedString invert_case(StringView);
DeprecatedString replace(StringView, StringView needle, StringView replacement, ReplaceMode);
+ErrorOr<String> replace(String const&, StringView needle, StringView replacement, ReplaceMode);
+
size_t count(StringView, StringView needle);
}