summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- Meta/CMake/unicode_data.cmake | 6
-rw-r--r-- Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp | 129
-rw-r--r-- Tests/LibUnicode/TestUnicodeCharacterTypes.cpp | 27
-rw-r--r-- Userland/Libraries/LibUnicode/CharacterTypes.cpp | 7
-rw-r--r-- Userland/Libraries/LibUnicode/CharacterTypes.h | 1
-rw-r--r-- Userland/Libraries/LibUnicode/UnicodeUtils.cpp | 41
-rw-r--r-- Userland/Libraries/LibUnicode/UnicodeUtils.h | 1
7 files changed, 207 insertions, 5 deletions
diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake
index 6914b99a11..dc97c6362b 100644
--- a/Meta/CMake/unicode_data.cmake
+++ b/Meta/CMake/unicode_data.cmake
@@ -13,6 +13,9 @@ set(UNICODE_DATA_PATH "${UCD_PATH}/${UNICODE_DATA_SOURCE}")
set(SPECIAL_CASING_SOURCE "SpecialCasing.txt")
set(SPECIAL_CASING_PATH "${UCD_PATH}/${SPECIAL_CASING_SOURCE}")
+set(CASE_FOLDING_SOURCE "CaseFolding.txt")
+set(CASE_FOLDING_PATH "${UCD_PATH}/${CASE_FOLDING_SOURCE}")
+
set(DERIVED_GENERAL_CATEGORY_SOURCE "extracted/DerivedGeneralCategory.txt")
set(DERIVED_GENERAL_CATEGORY_PATH "${UCD_PATH}/${DERIVED_GENERAL_CATEGORY_SOURCE}")
@@ -71,6 +74,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${UNICODE_DATA_SOURCE}" "${UNICODE_DATA_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SPECIAL_CASING_SOURCE}" "${SPECIAL_CASING_PATH}")
+ extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${CASE_FOLDING_SOURCE}" "${CASE_FOLDING_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_GENERAL_CATEGORY_SOURCE}" "${DERIVED_GENERAL_CATEGORY_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_LIST_SOURCE}" "${PROP_LIST_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_CORE_PROP_SOURCE}" "${DERIVED_CORE_PROP_PATH}")
@@ -105,7 +109,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
"${UCD_VERSION_FILE}"
"${UNICODE_DATA_HEADER}"
"${UNICODE_DATA_IMPLEMENTATION}"
- arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -k "${BLOCKS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
+ arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -o "${CASE_FOLDING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -k "${BLOCKS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
)
invoke_generator(
"EmojiData"
diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
index 62947ac326..f79b2bb889 100644
--- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
+++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
@@ -41,6 +41,13 @@ struct SpecialCasing {
DeprecatedString condition;
};
+// https://www.unicode.org/reports/tr44/#CaseFolding.txt
+// One parsed data line of CaseFolding.txt, as filled in by parse_case_folding().
+struct CaseFolding {
+ // Code point the folding applies to (parsed from the first field as hexadecimal).
+ u32 code_point { 0 };
+ // Status name; later emitted verbatim as a CaseFoldingStatus enumerator by the code generator.
+ StringView status { "Common"sv };
+ // Code points the entry folds to.
+ // NOTE(review): the `{ 0 }` default presumably yields a one-element list rather than an empty
+ // one; parse_case_folding() always overwrites it, but confirm this is intended.
+ Vector<u32> mapping { 0 };
+};
+
// https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
struct CodePointDecomposition {
// `tag` is a string since it's used for codegen as an enum value.
@@ -90,6 +97,7 @@ struct CodePointData {
Optional<u32> simple_lowercase_mapping;
Optional<u32> simple_titlecase_mapping;
Vector<u32> special_casing_indices;
+ Vector<u32> case_folding_indices;
};
struct BlockName {
@@ -117,6 +125,12 @@ struct UnicodeData {
Vector<DeprecatedString> conditions;
Vector<DeprecatedString> locales;
+ Vector<CaseFolding> case_folding;
+ u32 code_points_with_case_folding { 0 };
+ u32 largest_case_folding_mapping_size { 0 };
+ u32 largest_case_folding_size { 0 };
+ Vector<StringView> statuses;
+
Vector<CodePointData> code_point_data;
HashMap<u32, size_t> code_point_abbreviations;
@@ -276,6 +290,54 @@ static ErrorOr<void> parse_special_casing(Core::Stream::BufferedFile& file, Unic
return {};
}
+// Reads the case folding data file line by line, appending one CaseFolding entry per data line
+// to unicode_data.case_folding. While doing so it tracks the longest mapping seen and collects
+// every distinct status name. Finally the entries are sorted by code point, with ties broken by
+// status name.
+//
+// Errors from the underlying file reads are propagated; malformed data lines trip a VERIFY.
+static ErrorOr<void> parse_case_folding(Core::Stream::BufferedFile& file, UnicodeData& unicode_data)
+{
+    Array<u8, 1024> line_buffer;
+
+    while (TRY(file.can_read_line())) {
+        auto line = TRY(file.read_line(line_buffer));
+
+        // Blank lines and '#' comment lines carry no data.
+        if (line.is_empty())
+            continue;
+        if (line.starts_with('#'))
+            continue;
+
+        // A data line must split into exactly four ';'-separated fields; the first three are the
+        // code point, the status and the mapping.
+        auto fields = line.split_view(';', SplitBehavior::KeepEmpty);
+        VERIFY(fields.size() == 4);
+
+        CaseFolding entry {};
+        entry.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(fields[0]).value();
+        entry.mapping = parse_code_point_list(fields[2]);
+
+        // The first non-whitespace character of the status field picks the status name. Any
+        // other character leaves the entry's default status in place.
+        auto const status_code = fields[1].trim_whitespace()[0];
+        if (status_code == 'C')
+            entry.status = "Common"sv;
+        else if (status_code == 'F')
+            entry.status = "Full"sv;
+        else if (status_code == 'S')
+            entry.status = "Simple"sv;
+        else if (status_code == 'T')
+            entry.status = "Special"sv;
+
+        // Remember each status name once; these later become the CaseFoldingStatus enumerators.
+        if (!unicode_data.statuses.contains_slow(entry.status))
+            unicode_data.statuses.append(entry.status);
+
+        unicode_data.largest_case_folding_mapping_size = max(unicode_data.largest_case_folding_mapping_size, entry.mapping.size());
+
+        unicode_data.case_folding.append(move(entry));
+    }
+
+    quick_sort(unicode_data.case_folding, [](auto const& first, auto const& second) {
+        if (first.code_point == second.code_point)
+            return first.status < second.status;
+        return first.code_point < second.code_point;
+    });
+
+    return {};
+}
+
static ErrorOr<void> parse_prop_list(Core::Stream::BufferedFile& file, PropList& prop_list, bool multi_value_property = false, bool sanitize_property = false)
{
Array<u8, 1024> buffer;
@@ -667,6 +729,14 @@ static ErrorOr<void> parse_unicode_data(Core::Stream::BufferedFile& file, Unicod
}
}
+ bool has_case_folding { false };
+ for (size_t i = 0; i < unicode_data.case_folding.size(); ++i) {
+ if (auto const& folding = unicode_data.case_folding[i]; folding.code_point == data.code_point) {
+ data.case_folding_indices.append(i);
+ has_case_folding = true;
+ }
+ }
+
unicode_data.code_points_with_non_zero_combining_class += data.canonical_combining_class != 0;
unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value();
unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value();
@@ -675,8 +745,11 @@ static ErrorOr<void> parse_unicode_data(Core::Stream::BufferedFile& file, Unicod
unicode_data.code_points_with_special_casing += has_special_casing;
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
- previous_code_point = data.code_point;
+ unicode_data.code_points_with_case_folding += has_case_folding;
+ unicode_data.largest_case_folding_size = max(unicode_data.largest_case_folding_size, data.case_folding_indices.size());
+
+ previous_code_point = data.code_point;
unicode_data.code_point_data.append(move(data));
}
@@ -688,8 +761,9 @@ static ErrorOr<void> generate_unicode_data_header(Core::Stream::BufferedFile& fi
StringBuilder builder;
SourceGenerator generator { builder };
generator.set("special_casing_mapping_size", DeprecatedString::number(unicode_data.largest_special_casing_mapping_size));
+ generator.set("case_folding_mapping_size", DeprecatedString::number(unicode_data.largest_case_folding_mapping_size));
- auto generate_enum = [&](StringView name, StringView default_, Vector<DeprecatedString> values, Vector<Alias> aliases = {}) {
+ auto generate_enum = [&](StringView name, StringView default_, auto values, Vector<Alias> aliases = {}) {
quick_sort(values);
quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; });
@@ -737,6 +811,7 @@ namespace Unicode {
generate_enum("Locale"sv, "None"sv, unicode_data.locales);
generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
+ generate_enum("CaseFoldingStatus"sv, {}, move(unicode_data.statuses));
generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases);
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
@@ -763,6 +838,14 @@ struct SpecialCasing {
Condition condition { Condition::None };
};
+struct CaseFolding {
+ u32 code_point { 0 };
+ CaseFoldingStatus status { CaseFoldingStatus::Common };
+
+ u32 mapping[@case_folding_mapping_size@];
+ u32 mapping_size { 0 };
+};
+
struct CodePointDecompositionRaw {
u32 code_point { 0 };
CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
@@ -779,6 +862,7 @@ struct CodePointDecomposition {
Optional<Locale> locale_from_string(StringView locale);
Span<SpecialCasing const* const> special_case_mapping(u32 code_point);
+Span<CaseFolding const* const> case_folding_mapping(u32 code_point);
}
)~~~");
@@ -795,6 +879,8 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::Stream::Buffered
generator.set("string_index_type"sv, unicode_data.unique_strings.type_that_fits());
generator.set("largest_special_casing_size", DeprecatedString::number(unicode_data.largest_special_casing_size));
generator.set("special_casing_size", DeprecatedString::number(unicode_data.special_casing.size()));
+ generator.set("largest_case_folding_size", DeprecatedString::number(unicode_data.largest_case_folding_size));
+ generator.set("case_folding_size", DeprecatedString::number(unicode_data.case_folding.size()));
generator.append(R"~~~(
#include <AK/Array.h>
@@ -830,7 +916,7 @@ namespace Unicode {
};
generator.append(R"~~~(
-static constexpr Array<SpecialCasing, @special_casing_size@> s_special_casing { {)~~~");
+static constexpr Array<SpecialCasing, @special_casing_size@> s_special_case { {)~~~");
for (auto const& casing : unicode_data.special_casing) {
generator.set("code_point", DeprecatedString::formatted("{:#x}", casing.code_point));
@@ -854,6 +940,21 @@ static constexpr Array<SpecialCasing, @special_casing_size@> s_special_casing {
generator.append(R"~~~(
} };
+static constexpr Array<CaseFolding, @case_folding_size@> s_case_folding { {)~~~");
+
+ for (auto const& folding : unicode_data.case_folding) {
+ generator.set("code_point", DeprecatedString::formatted("{:#x}", folding.code_point));
+ generator.set("status", folding.status);
+ generator.append(R"~~~(
+ { @code_point@, CaseFoldingStatus::@status@)~~~");
+
+ append_list_and_size(folding.mapping, "0x{:x}"sv);
+ generator.append(" },");
+ }
+
+ generator.append(R"~~~(
+} };
+
struct CodePointMapping {
u32 code_point { 0 };
u32 mapping { 0 };
@@ -865,6 +966,12 @@ struct SpecialCaseMapping {
u32 special_casing_size { 0 };
};
+struct CaseFoldingMapping {
+ u32 code_point { 0 };
+ Array<CaseFolding const*, @largest_case_folding_size@> case_folding {};
+ u32 case_folding_size { 0 };
+};
+
struct CodePointAbbreviation {
u32 code_point { 0 };
@string_index_type@ abbreviation { 0 };
@@ -953,7 +1060,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
generator.set("size", DeprecatedString::number(mapping->decomposition_size));
generator.append(", CompatibilityFormattingTag::@tag@, @start@, @size@ },");
} else {
- append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
+ append_list_and_size(mapping, "&s_@name@[{}]"sv);
generator.append(" },");
}
@@ -977,6 +1084,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; });
append_code_point_mappings("titlecase"sv, "CodePointMapping"sv, unicode_data.simple_titlecase_mapping_size, [](auto const& data) { return data.simple_titlecase_mapping; });
append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
+ append_code_point_mappings("case_folding"sv, "CaseFoldingMapping"sv, unicode_data.code_points_with_case_folding, [](auto const& data) { return data.case_folding_indices; });
append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping,
@@ -1148,6 +1256,15 @@ Span<SpecialCasing const* const> special_case_mapping(u32 code_point)
return mapping->special_casing.span().slice(0, mapping->special_casing_size);
}
+Span<CaseFolding const* const> case_folding_mapping(u32 code_point)
+{
+ auto const* mapping = binary_search(s_case_folding_mappings, code_point, nullptr, CodePointComparator<CaseFoldingMapping> {});
+ if (mapping == nullptr)
+ return {};
+
+ return mapping->case_folding.span().slice(0, mapping->case_folding_size);
+}
+
Optional<StringView> code_point_abbreviation(u32 code_point)
{
auto const* mapping = binary_search(s_abbreviation_mappings, code_point, nullptr, CodePointComparator<CodePointAbbreviation> {});
@@ -1373,6 +1490,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
StringView generated_implementation_path;
StringView unicode_data_path;
StringView special_casing_path;
+ StringView case_folding_path;
StringView derived_general_category_path;
StringView prop_list_path;
StringView derived_core_prop_path;
@@ -1394,6 +1512,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path");
+ args_parser.add_option(case_folding_path, "Path to CaseFolding.txt file", "case-folding-path", 'o', "case-folding-path");
args_parser.add_option(derived_general_category_path, "Path to DerivedGeneralCategory.txt file", "derived-general-category-path", 'g', "derived-general-category-path");
args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
@@ -1416,6 +1535,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
auto unicode_data_file = TRY(open_file(unicode_data_path, Core::Stream::OpenMode::Read));
auto derived_general_category_file = TRY(open_file(derived_general_category_path, Core::Stream::OpenMode::Read));
auto special_casing_file = TRY(open_file(special_casing_path, Core::Stream::OpenMode::Read));
+ auto case_folding_file = TRY(open_file(case_folding_path, Core::Stream::OpenMode::Read));
auto prop_list_file = TRY(open_file(prop_list_path, Core::Stream::OpenMode::Read));
auto derived_core_prop_file = TRY(open_file(derived_core_prop_path, Core::Stream::OpenMode::Read));
auto derived_binary_prop_file = TRY(open_file(derived_binary_prop_path, Core::Stream::OpenMode::Read));
@@ -1433,6 +1553,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
UnicodeData unicode_data {};
TRY(parse_special_casing(*special_casing_file, unicode_data));
+ TRY(parse_case_folding(*case_folding_file, unicode_data));
TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories));
TRY(parse_prop_list(*prop_list_file, unicode_data.prop_list));
TRY(parse_prop_list(*derived_core_prop_file, unicode_data.prop_list));
diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
index 971461ba8e..cc13963e27 100644
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -97,6 +97,33 @@ TEST_CASE(to_unicode_titlecase)
EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("123dollars"sv)), "123Dollars"sv);
}
+TEST_CASE(to_unicode_casefold)
+{
+    // For every ASCII byte, full case folding must yield exactly one byte equal to tolower().
+    for (u8 code_point = 0; code_point < 0x80; ++code_point) {
+        auto ascii = tolower(code_point);
+        auto unicode = MUST(Unicode::to_unicode_casefold_full({ reinterpret_cast<char const*>(&code_point), 1 }));
+        auto folded_bytes = unicode.bytes_as_string_view();
+
+        EXPECT_EQ(folded_bytes.length(), 1u);
+        EXPECT_EQ(folded_bytes[0], ascii);
+    }
+
+    // The remaining cases fold a single code point into a multi-code-point sequence.
+
+    // LATIN SMALL LETTER SHARP S
+    auto sharp_s = MUST(Unicode::to_unicode_casefold_full("\u00DF"sv));
+    EXPECT_EQ(sharp_s, "\u0073\u0073"sv);
+
+    // GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
+    auto alpha_ypogegrammeni = MUST(Unicode::to_unicode_casefold_full("\u1FB3"sv));
+    EXPECT_EQ(alpha_ypogegrammeni, "\u03B1\u03B9"sv);
+
+    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI
+    auto alpha_perispomeni = MUST(Unicode::to_unicode_casefold_full("\u1FB6"sv));
+    EXPECT_EQ(alpha_perispomeni, "\u03B1\u0342"sv);
+
+    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+    auto alpha_perispomeni_ypogegrammeni = MUST(Unicode::to_unicode_casefold_full("\u1FB7"sv));
+    EXPECT_EQ(alpha_perispomeni_ypogegrammeni, "\u03B1\u0342\u03B9"sv);
+}
+
TEST_CASE(to_unicode_lowercase_unconditional_special_casing)
{
// LATIN SMALL LETTER SHARP S
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index 50ded775c1..a38609399d 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -61,6 +61,13 @@ ErrorOr<String> to_unicode_titlecase_full(StringView string, Optional<StringView
return builder.to_string();
}
+ErrorOr<String> to_unicode_casefold_full(StringView string)
+{
+ StringBuilder builder;
+ TRY(Detail::build_casefold_string(Utf8View { string }, builder));
+ return builder.to_string();
+}
+
Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h
index 43f3c8f6e9..1976d614e9 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -44,6 +44,7 @@ u32 to_unicode_titlecase(u32 code_point);
ErrorOr<DeprecatedString> to_unicode_lowercase_full(StringView, Optional<StringView> const& locale = {});
ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView, Optional<StringView> const& locale = {});
ErrorOr<String> to_unicode_titlecase_full(StringView, Optional<StringView> const& locale = {});
+ErrorOr<String> to_unicode_casefold_full(StringView);
Optional<GeneralCategory> general_category_from_string(StringView);
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
index 8af88ea244..fbd3a8a164 100644
--- a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
+++ b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
@@ -195,6 +195,19 @@ static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View
return nullptr;
}
+// Returns the first case folding entry for `code_point` whose status equals one of the
+// `StatusFilter` template arguments, or nullptr when no entry matches.
+template<CaseFoldingStatus... StatusFilter>
+static CaseFolding const* find_matching_case_folding(u32 code_point)
+{
+    // True when `status` is any of the statuses listed in the template argument pack.
+    auto const is_accepted = [](CaseFoldingStatus status) {
+        return ((status == StatusFilter) || ...);
+    };
+
+    for (auto const* candidate : case_folding_mapping(code_point)) {
+        if (is_accepted(candidate->status))
+            return candidate;
+    }
+
+    return nullptr;
+}
+
#endif
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
@@ -314,4 +327,32 @@ ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[ma
#endif
}
+// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
+//
+// Appends the case-folded form of each code point in `code_points` to `builder`. A code point
+// without a matching folding entry is appended unchanged. When ENABLE_UNICODE_DATA is off, an
+// error is returned instead.
+ErrorOr<void> build_casefold_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder)
+{
+#if ENABLE_UNICODE_DATA
+    // toCasefold(X): Map each character C in X to Case_Folding(C).
+    //
+    // Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file
+    // CaseFolding.txt in the Unicode Character Database.
+
+    for (auto code_point : code_points) {
+        auto const* folding = find_matching_case_folding<CaseFoldingStatus::Common, CaseFoldingStatus::Full>(code_point);
+
+        if (folding != nullptr) {
+            // Emit every code point of the folded sequence, in order.
+            for (size_t index = 0; index < folding->mapping_size; ++index)
+                TRY(builder.try_append_code_point(folding->mapping[index]));
+        } else {
+            // No Common or Full entry: the code point is passed through as-is.
+            TRY(builder.try_append_code_point(code_point));
+        }
+    }
+
+    return {};
+#else
+    return Error::from_string_literal("Unicode data has been disabled");
+#endif
+}
+
}
diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.h b/Userland/Libraries/LibUnicode/UnicodeUtils.h
index 5e9bcbf2a7..af7702abbc 100644
--- a/Userland/Libraries/LibUnicode/UnicodeUtils.h
+++ b/Userland/Libraries/LibUnicode/UnicodeUtils.h
@@ -17,5 +17,6 @@ namespace Unicode::Detail {
ErrorOr<void> build_lowercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
ErrorOr<void> build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
ErrorOr<void> build_titlecase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
+ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder);
}