diff options
-rw-r--r-- | Tests/LibUnicode/TestUnicodeCharacterTypes.cpp | 31 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/CharacterTypes.cpp | 31 |
2 files changed, 62 insertions, 0 deletions
diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index eff0b89286..18cff2bdcb 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -257,6 +257,37 @@ TEST_CASE(to_unicode_uppercase_unconditional_special_casing) EXPECT_EQ(result, "\u03A9\u0342\u0399"); } +TEST_CASE(to_unicode_uppercase_special_casing_soft_dotted) +{ + // LATIN SMALL LETTER I + auto result = Unicode::to_unicode_uppercase_full("i"sv, "en"sv); + EXPECT_EQ(result, "I"sv); + + result = Unicode::to_unicode_uppercase_full("i"sv, "lt"sv); + EXPECT_EQ(result, "I"sv); + + // LATIN SMALL LETTER J + result = Unicode::to_unicode_uppercase_full("j"sv, "en"sv); + EXPECT_EQ(result, "J"sv); + + result = Unicode::to_unicode_uppercase_full("j"sv, "lt"sv); + EXPECT_EQ(result, "J"sv); + + // LATIN SMALL LETTER I followed by COMBINING DOT ABOVE + result = Unicode::to_unicode_uppercase_full("i\u0307"sv, "en"sv); + EXPECT_EQ(result, "I\u0307"sv); + + result = Unicode::to_unicode_uppercase_full("i\u0307"sv, "lt"sv); + EXPECT_EQ(result, "I"sv); + + // LATIN SMALL LETTER J followed by COMBINING DOT ABOVE + result = Unicode::to_unicode_uppercase_full("j\u0307"sv, "en"sv); + EXPECT_EQ(result, "J\u0307"sv); + + result = Unicode::to_unicode_uppercase_full("j\u0307"sv, "lt"sv); + EXPECT_EQ(result, "J"sv); +} + TEST_CASE(general_category) { auto general_category = [](StringView name) { diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index b566416c08..b6f3555632 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -49,6 +49,32 @@ static bool is_after_uppercase_i(Utf8View const& string, size_t index) return found_uppercase_i; } +static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index) +{ + // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above). + auto preceding_view = string.substring_view(0, index); + bool found_soft_dotted_code_point = false; + + // FIXME: Would be better if Utf8View supported reverse iteration. + for (auto code_point : preceding_view) { + if (code_point_has_property(code_point, Property::Soft_Dotted)) { + found_soft_dotted_code_point = true; + continue; + } + + auto unicode_data = Detail::unicode_data_for_code_point(code_point); + if (!unicode_data.has_value()) + return false; + + if (unicode_data->canonical_combining_class == 0) + found_soft_dotted_code_point = false; + else if (unicode_data->canonical_combining_class == 230) + found_soft_dotted_code_point = false; + } + + return found_soft_dotted_code_point; +} + static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length) { // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable @@ -113,6 +139,11 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, O return special_casing; break; + case Condition::AfterSoftDotted: + if (is_after_soft_dotted_code_point(string, index)) + return special_casing; + break; + case Condition::FinalSigma: if (is_final_code_point(string, index, byte_length)) return special_casing; |