summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- Tests/LibUnicode/TestUnicodeCharacterTypes.cpp | 93
-rw-r--r-- Userland/Libraries/LibUnicode/CharacterTypes.cpp | 7
-rw-r--r-- Userland/Libraries/LibUnicode/CharacterTypes.h | 2
-rw-r--r-- Userland/Libraries/LibUnicode/UnicodeUtils.cpp | 62
-rw-r--r-- Userland/Libraries/LibUnicode/UnicodeUtils.h | 1
5 files changed, 165 insertions, 0 deletions
diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
index dd6f5c4654..971461ba8e 100644
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -74,6 +74,27 @@ TEST_CASE(to_unicode_titlecase)
EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c9u), 0x01c8u); // "lj" to "Lj"
EXPECT_EQ(Unicode::to_unicode_titlecase(0x01ccu), 0x01cbu); // "nj" to "Nj"
EXPECT_EQ(Unicode::to_unicode_titlecase(0x01f3u), 0x01f2u); // "dz" to "Dz"
+
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(""sv)), ""sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" "sv)), " "sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" - "sv)), " - "sv);
+
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("a"sv)), "A"sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("A"sv)), "A"sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" a"sv)), " A"sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("a "sv)), "A "sv);
+
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("ab"sv)), "Ab"sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("Ab"sv)), "Ab"sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("aB"sv)), "Ab"sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("AB"sv)), "Ab"sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" ab"sv)), " Ab"sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("ab "sv)), "Ab "sv);
+
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo bar baz"sv)), "Foo Bar Baz"sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo \n \r bar \t baz"sv)), "Foo \n \r Bar \t Baz"sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("f\"oo\" b'ar'"sv)), "F\"Oo\" B'Ar'"sv);
+ EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("123dollars"sv)), "123Dollars"sv);
}
TEST_CASE(to_unicode_lowercase_unconditional_special_casing)
@@ -382,6 +403,78 @@ TEST_CASE(to_unicode_uppercase_special_casing_soft_dotted)
EXPECT_EQ(result, "J"sv);
}
+TEST_CASE(to_unicode_titlecase_unconditional_special_casing) // Full titlecasing with no locale argument: single code points whose expected output is one or more code points.
+{
+ // LATIN SMALL LETTER SHARP S
+ auto result = MUST(Unicode::to_unicode_titlecase_full("\u00DF"sv));
+ EXPECT_EQ(result, "\u0053\u0073"sv); // "Ss": one input code point, two output code points
+
+ // LATIN CAPITAL LETTER I WITH DOT ABOVE
+ result = MUST(Unicode::to_unicode_titlecase_full("\u0130"sv));
+ EXPECT_EQ(result, "\u0130"sv); // expected to come back unchanged
+
+ // LATIN SMALL LIGATURE FF
+ result = MUST(Unicode::to_unicode_titlecase_full("\uFB00"sv));
+ EXPECT_EQ(result, "\u0046\u0066"sv); // "Ff"
+
+ // LATIN SMALL LIGATURE FI
+ result = MUST(Unicode::to_unicode_titlecase_full("\uFB01"sv));
+ EXPECT_EQ(result, "\u0046\u0069"sv); // "Fi"
+
+ // LATIN SMALL LIGATURE FL
+ result = MUST(Unicode::to_unicode_titlecase_full("\uFB02"sv));
+ EXPECT_EQ(result, "\u0046\u006C"sv); // "Fl"
+
+ // LATIN SMALL LIGATURE FFI
+ result = MUST(Unicode::to_unicode_titlecase_full("\uFB03"sv));
+ EXPECT_EQ(result, "\u0046\u0066\u0069"sv); // "Ffi"
+
+ // LATIN SMALL LIGATURE FFL
+ result = MUST(Unicode::to_unicode_titlecase_full("\uFB04"sv));
+ EXPECT_EQ(result, "\u0046\u0066\u006C"sv); // "Ffl"
+
+ // LATIN SMALL LIGATURE LONG S T
+ result = MUST(Unicode::to_unicode_titlecase_full("\uFB05"sv));
+ EXPECT_EQ(result, "\u0053\u0074"sv); // "St"
+
+ // LATIN SMALL LIGATURE ST
+ result = MUST(Unicode::to_unicode_titlecase_full("\uFB06"sv));
+ EXPECT_EQ(result, "\u0053\u0074"sv); // "St", same expectation as the LONG S T ligature above
+
+ // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+ result = MUST(Unicode::to_unicode_titlecase_full("\u0390"sv));
+ EXPECT_EQ(result, "\u0399\u0308\u0301"sv); // capital iota followed by two combining marks (U+0308, U+0301)
+
+ // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+ result = MUST(Unicode::to_unicode_titlecase_full("\u03B0"sv));
+ EXPECT_EQ(result, "\u03A5\u0308\u0301"sv); // capital upsilon followed by U+0308, U+0301
+
+ // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+ result = MUST(Unicode::to_unicode_titlecase_full("\u1FB7"sv));
+ EXPECT_EQ(result, "\u0391\u0342\u0345"sv); // capital alpha followed by U+0342, U+0345
+
+ // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+ result = MUST(Unicode::to_unicode_titlecase_full("\u1FC7"sv));
+ EXPECT_EQ(result, "\u0397\u0342\u0345"sv); // capital eta followed by U+0342, U+0345
+
+ // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+ result = MUST(Unicode::to_unicode_titlecase_full("\u1FF7"sv));
+ EXPECT_EQ(result, "\u03A9\u0342\u0345"sv); // capital omega followed by U+0342, U+0345
+}
+
+TEST_CASE(to_unicode_titlecase_special_casing_i) // Titlecasing "i" with an explicit locale argument: the expected result differs per locale.
+{
+ // LATIN SMALL LETTER I
+ auto result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "en"sv));
+ EXPECT_EQ(result, "I"sv); // "en": plain capital I is expected
+
+ result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "az"sv));
+ EXPECT_EQ(result, "\u0130"sv); // "az": U+0130 (capital I with dot above) is expected
+
+ result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "tr"sv));
+ EXPECT_EQ(result, "\u0130"sv); // "tr": same expectation as "az"
+}
+
TEST_CASE(general_category)
{
auto general_category = [](StringView name) {
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index 4ab8b9a691..3f1b62b95c 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -57,6 +57,13 @@ ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView string, Optional<
return builder.to_deprecated_string();
}
+ErrorOr<String> to_unicode_titlecase_full(StringView string, Optional<StringView> const& locale) // Returns a titlecased copy of `string`; `locale` is passed through unchanged to the Detail helper.
+{
+ StringBuilder builder; // receives the output appended by Detail::build_titlecase_string
+ TRY(Detail::build_titlecase_string(Utf8View { string }, builder, locale)); // input is wrapped in a Utf8View; TRY is expected to return early with the helper's error (AK convention)
+ return builder.to_string(); // unlike the lowercase/uppercase siblings declared in CharacterTypes.h, this yields String rather than DeprecatedString
+}
+
Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h
index 04ce644d30..43f3c8f6e9 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -10,6 +10,7 @@
#include <AK/Forward.h>
#include <AK/Optional.h>
#include <AK/Span.h>
+#include <AK/String.h>
#include <AK/Types.h>
#include <AK/Vector.h>
#include <LibUnicode/Forward.h>
@@ -42,6 +43,7 @@ u32 to_unicode_titlecase(u32 code_point);
ErrorOr<DeprecatedString> to_unicode_lowercase_full(StringView, Optional<StringView> const& locale = {});
ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView, Optional<StringView> const& locale = {});
+ErrorOr<String> to_unicode_titlecase_full(StringView, Optional<StringView> const& locale = {}); // String-level titlecasing; `locale` defaults to an empty Optional.
Optional<GeneralCategory> general_category_from_string(StringView);
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
index 992122690a..e8c03a0fb6 100644
--- a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
+++ b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
@@ -249,4 +249,66 @@ ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[ma
#endif
}
+ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale) // Appends the titlecased form of `code_points` to `builder`, one word segment at a time; returns an error when Unicode data is compiled out.
+{
+#if ENABLE_UNICODE_DATA
+ // toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29,
+ // “Unicode Text Segmentation.” For each word boundary, find the first cased character F following
+ // the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between
+ // F and the following word boundary to Lowercase_Mapping(C).
+
+ auto boundaries = find_word_segmentation_boundaries(code_points); // used below as byte offsets into `code_points`
+ if (boundaries.is_empty()) // nothing to append; also guards the `boundaries.size() - 1` loop bound below
+ return {};
+
+ auto first_cased_code_point_after_boundary = [&](auto boundary, auto next_boundary) -> Optional<Utf8CodePointIterator> { // scans [boundary, next_boundary) for the first code point with Property::Cased
+ auto it = code_points.iterator_at_byte_offset_without_validation(boundary);
+ auto end = code_points.iterator_at_byte_offset_without_validation(next_boundary);
+
+ for (; it != end; ++it) {
+ if (code_point_has_property(*it, Property::Cased))
+ return it;
+ }
+
+ return {}; // no cased code point in this segment
+ };
+
+ auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr<void> { // appends the titlecase form of one code point, preferring a matching special-casing entry
+ auto const* special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length);
+ if (!special_casing) { // no special case matched: fall back to the single-code-point mapping
+ TRY(builder.try_append_code_point(to_unicode_titlecase(code_point)));
+ return {};
+ }
+
+ for (size_t i = 0; i < special_casing->titlecase_mapping_size; ++i) // a special-casing mapping is emitted code point by code point, so it may be longer than one
+ TRY(builder.try_append_code_point(special_casing->titlecase_mapping[i]));
+ return {};
+ };
+
+ for (size_t i = 0; i < boundaries.size() - 1; ++i) { // each pair of adjacent boundaries delimits one segment
+ auto boundary = boundaries[i];
+ auto next_boundary = boundaries[i + 1];
+
+ if (auto it = first_cased_code_point_after_boundary(boundary, next_boundary); it.has_value()) {
+ auto code_point = *it.value();
+ auto code_point_offset = code_points.byte_offset_of(*it);
+ auto code_point_length = it->underlying_code_point_length_in_bytes();
+
+ auto caseless_code_points = code_points.substring_view(boundary, code_point_offset - boundary); // everything in the segment before the first cased code point
+ TRY(builder.try_append(caseless_code_points.as_string())); // copied through unchanged
+
+ TRY(append_code_point_as_titlecase(code_point, code_point_offset, code_point_length));
+ boundary = code_point_offset + code_point_length; // the lowercase pass below starts right after the titlecased code point
+ }
+
+ auto substring_to_lowercase = code_points.substring_view(boundary, next_boundary - boundary); // rest of the segment, or the whole segment when no cased code point was found
+ TRY(build_lowercase_string(substring_to_lowercase, builder, locale)); // NOTE(review): the helper is handed only this substring, so any context-dependent lowercase rule presumably cannot see the preceding text — confirm
+ }
+
+ return {};
+#else
+ return Error::from_string_literal("Unicode data has been disabled");
+#endif
+}
+
}
diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.h b/Userland/Libraries/LibUnicode/UnicodeUtils.h
index 1770c385a7..5e9bcbf2a7 100644
--- a/Userland/Libraries/LibUnicode/UnicodeUtils.h
+++ b/Userland/Libraries/LibUnicode/UnicodeUtils.h
@@ -16,5 +16,6 @@ namespace Unicode::Detail {
ErrorOr<void> build_lowercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
ErrorOr<void> build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
+ErrorOr<void> build_titlecase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale); // Appends the titlecased form of `code_points` to `builder`.
}