diff options
author | Timothy Flynn <trflynn89@pm.me> | 2023-01-16 11:22:01 -0500 |
---|---|---|
committer | Tim Flynn <trflynn89@pm.me> | 2023-01-16 18:33:44 -0500 |
commit | bc51017a03087057dc8e8f437b4049f2ab7ebba1 (patch) | |
tree | 043d1b7c361016bdd7fb86d1669252312fef261b /Userland/Libraries | |
parent | b562348d316cbd3a646f33490809c356c591265d (diff) | |
download | serenity-bc51017a03087057dc8e8f437b4049f2ab7ebba1.zip |
LibUnicode: Support full case folding for titlecasing a string
Unicode declares that to titlecase a string, the first cased code point
after each word boundary should be transformed to its titlecase mapping.
All following code points up to the next word boundary are transformed to their lowercase mapping.
Diffstat (limited to 'Userland/Libraries')
-rw-r--r-- | Userland/Libraries/LibUnicode/CharacterTypes.cpp | 7 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/CharacterTypes.h | 2 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/UnicodeUtils.cpp | 62 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/UnicodeUtils.h | 1 |
4 files changed, 72 insertions, 0 deletions
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 4ab8b9a691..3f1b62b95c 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -57,6 +57,13 @@ ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView string, Optional< return builder.to_deprecated_string(); } +ErrorOr<String> to_unicode_titlecase_full(StringView string, Optional<StringView> const& locale) +{ + StringBuilder builder; + TRY(Detail::build_titlecase_string(Utf8View { string }, builder, locale)); + return builder.to_string(); +} + Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; } bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; } Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index 04ce644d30..43f3c8f6e9 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -10,6 +10,7 @@ #include <AK/Forward.h> #include <AK/Optional.h> #include <AK/Span.h> +#include <AK/String.h> #include <AK/Types.h> #include <AK/Vector.h> #include <LibUnicode/Forward.h> @@ -42,6 +43,7 @@ u32 to_unicode_titlecase(u32 code_point); ErrorOr<DeprecatedString> to_unicode_lowercase_full(StringView, Optional<StringView> const& locale = {}); ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView, Optional<StringView> const& locale = {}); +ErrorOr<String> to_unicode_titlecase_full(StringView, Optional<StringView> const& locale = {}); Optional<GeneralCategory> general_category_from_string(StringView); bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp 
b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp index 992122690a..e8c03a0fb6 100644 --- a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp +++ b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp @@ -249,4 +249,66 @@ ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[ma #endif } +ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale) +{ +#if ENABLE_UNICODE_DATA + // toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29, + // “Unicode Text Segmentation.” For each word boundary, find the first cased character F following + // the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between + // F and the following word boundary to Lowercase_Mapping(C). + + auto boundaries = find_word_segmentation_boundaries(code_points); + if (boundaries.is_empty()) + return {}; + + auto first_cased_code_point_after_boundary = [&](auto boundary, auto next_boundary) -> Optional<Utf8CodePointIterator> { + auto it = code_points.iterator_at_byte_offset_without_validation(boundary); + auto end = code_points.iterator_at_byte_offset_without_validation(next_boundary); + + for (; it != end; ++it) { + if (code_point_has_property(*it, Property::Cased)) + return it; + } + + return {}; + }; + + auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr<void> { + auto const* special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length); + if (!special_casing) { + TRY(builder.try_append_code_point(to_unicode_titlecase(code_point))); + return {}; + } + + for (size_t i = 0; i < special_casing->titlecase_mapping_size; ++i) + TRY(builder.try_append_code_point(special_casing->titlecase_mapping[i])); + return {}; + }; + + for (size_t i = 0; i < boundaries.size() - 1; ++i) { + auto boundary = 
boundaries[i]; + auto next_boundary = boundaries[i + 1]; + + if (auto it = first_cased_code_point_after_boundary(boundary, next_boundary); it.has_value()) { + auto code_point = *it.value(); + auto code_point_offset = code_points.byte_offset_of(*it); + auto code_point_length = it->underlying_code_point_length_in_bytes(); + + auto caseless_code_points = code_points.substring_view(boundary, code_point_offset - boundary); + TRY(builder.try_append(caseless_code_points.as_string())); + + TRY(append_code_point_as_titlecase(code_point, code_point_offset, code_point_length)); + boundary = code_point_offset + code_point_length; + } + + auto substring_to_lowercase = code_points.substring_view(boundary, next_boundary - boundary); + TRY(build_lowercase_string(substring_to_lowercase, builder, locale)); + } + + return {}; +#else + return Error::from_string_literal("Unicode data has been disabled"); +#endif +} + } diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.h b/Userland/Libraries/LibUnicode/UnicodeUtils.h index 1770c385a7..5e9bcbf2a7 100644 --- a/Userland/Libraries/LibUnicode/UnicodeUtils.h +++ b/Userland/Libraries/LibUnicode/UnicodeUtils.h @@ -16,5 +16,6 @@ namespace Unicode::Detail { ErrorOr<void> build_lowercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale); ErrorOr<void> build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale); +ErrorOr<void> build_titlecase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale); } |