diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-08-10 07:17:24 -0400 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2021-08-11 13:11:01 +0200 |
commit | 7dce2bfe233de92a8efd0c12bc746adddbb557a0 (patch) | |
tree | 325bab8265714a30358c10a87673f8f07c38f2ae /Tests/LibUnicode | |
parent | 4e546cee97df984f5327922c312520e36c1d9688 (diff) | |
download | serenity-7dce2bfe233de92a8efd0c12bc746adddbb557a0.zip |
LibUnicode: Generate separate tables for General Category properties
Previously, each code point's General Category was part of the generated
UnicodeData structure. This ultimately presented two problems, one
functional and one performance-related:
* Some General Categories are applied to unassigned code points, for
example the Unassigned (Cn) category. Unassigned code points are
strictly excluded from UnicodeData.txt, so by relying on that file,
the generator is unable to handle these categories.
* Lookups for General Categories are slower when searching through the
large UnicodeData hash map. Even though lookups are O(1), the hash
function turned out to be slower than binary searching through a
category-specific table.
So, now a table is generated for each General Category. When querying a
code point for a category, a binary search is done over the code point
ranges in that category's table to check if the code point has that category.
Further, General Categories are now parsed from the UCD file
DerivedGeneralCategory.txt. This file is a normal "prop list" file and
contains the categories for unassigned code points.
Diffstat (limited to 'Tests/LibUnicode')
-rw-r--r-- | Tests/LibUnicode/TestUnicodeCharacterTypes.cpp | 73 |
1 files changed, 73 insertions, 0 deletions
diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index a71fd0d9f3..481855fd06 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -213,3 +213,76 @@ TEST_CASE(to_unicode_uppercase_unconditional_special_casing) result = Unicode::to_unicode_uppercase_full("\u1FF7"sv); EXPECT_EQ(result, "\u03A9\u0342\u0399"); } + +TEST_CASE(general_category) +{ + auto general_category = [](StringView name) { + auto general_category = Unicode::general_category_from_string(name); + VERIFY(general_category.has_value()); + return *general_category; + }; + + auto general_category_c = general_category("C"sv); + auto general_category_other = general_category("Other"sv); + EXPECT_EQ(general_category_c, general_category_other); + + auto general_category_cc = general_category("Cc"sv); + auto general_category_control = general_category("Control"sv); + EXPECT_EQ(general_category_cc, general_category_control); + + auto general_category_co = general_category("Co"sv); + auto general_category_private_use = general_category("Private_Use"sv); + EXPECT_EQ(general_category_co, general_category_private_use); + + auto general_category_lc = general_category("LC"sv); + auto general_category_cased_letter = general_category("Cased_Letter"sv); + EXPECT_EQ(general_category_lc, general_category_cased_letter); + + auto general_category_ll = general_category("Ll"sv); + auto general_category_lowercase_letter = general_category("Lowercase_Letter"sv); + EXPECT_EQ(general_category_ll, general_category_lowercase_letter); + + auto general_category_lu = general_category("Lu"sv); + auto general_category_uppercase_letter = general_category("Uppercase_Letter"sv); + EXPECT_EQ(general_category_lu, general_category_uppercase_letter); + + for (u32 code_point = 0; code_point <= 0x1f; ++code_point) { + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c)); + 
EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cc)); + + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu)); + } + + for (u32 code_point = 0xe000; code_point <= 0xe100; ++code_point) { + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c)); + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_co)); + + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu)); + } + + for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) { + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc)); + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_ll)); + + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu)); + } + + for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) { + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc)); + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lu)); + + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c)); + EXPECT(!Unicode::code_point_has_general_category(code_point, 
general_category_cc)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll)); + } +} |