summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTimothy Flynn <trflynn89@pm.me>2021-08-10 17:42:21 -0400
committerAndreas Kling <kling@serenityos.org>2021-08-11 13:11:01 +0200
commit1e91334008d70a7142b7810a6348a9a2f384da95 (patch)
tree7402fa412c06ea1ec3b8d663c730cde417a81b0c
parent47bb350ebde09a11a594a605d3faea4167392048 (diff)
downloadserenity-1e91334008d70a7142b7810a6348a9a2f384da95.zip
LibUnicode: Handle edge-case script extensions, Common and Inherited
These script extensions have some peculiar behavior in the Unicode spec. The UCD ScriptExtensions file does not contain these scripts. Rather, it is implied that the code points which have these scripts as an extension are the code points that both: 1. Have Common or Inherited as their primary script value; 2. Do not have any other script value in their script extension lists. Because these are not explicitly listed in the UCD, we must manually form these script extensions.
-rw-r--r--Tests/LibUnicode/TestUnicodeCharacterTypes.cpp22
-rw-r--r--Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp27
2 files changed, 49 insertions, 0 deletions
diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
index 8afbbdc5b4..65aa4ca5d2 100644
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -464,4 +464,26 @@ TEST_CASE(script_extension)
EXPECT(!Unicode::code_point_has_script(0x1dc1, script_greek));
EXPECT(Unicode::code_point_has_script_extension(0x1dc1, script_greek));
+
+ auto script_common = script("Common"sv);
+ auto script_zyyy = script("Zyyy"sv);
+ EXPECT_EQ(script_common, script_zyyy);
+
+ EXPECT(Unicode::code_point_has_script(0x202f, script_common));
+ EXPECT(!Unicode::code_point_has_script_extension(0x202f, script_common));
+
+ EXPECT(Unicode::code_point_has_script(0x3000, script_common));
+ EXPECT(Unicode::code_point_has_script_extension(0x3000, script_common));
+
+ auto script_inherited = script("Inherited"sv);
+ auto script_qaai = script("Qaai"sv);
+ auto script_zinh = script("Zinh"sv);
+ EXPECT_EQ(script_inherited, script_qaai);
+ EXPECT_EQ(script_inherited, script_zinh);
+
+ EXPECT(Unicode::code_point_has_script(0x1ced, script_inherited));
+ EXPECT(!Unicode::code_point_has_script_extension(0x1ced, script_inherited));
+
+ EXPECT(Unicode::code_point_has_script(0x101fd, script_inherited));
+ EXPECT(Unicode::code_point_has_script_extension(0x101fd, script_inherited));
}
diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
index 0446e6cbf2..5d1c289336 100644
--- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
+++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
@@ -883,6 +883,33 @@ static void normalize_script_extensions(PropList& script_extensions, PropList co
sort_and_merge_code_point_ranges(code_points);
}
+
+ // Lastly, the Common and Inherited script extensions are special. They must not contain any
+ // code points which appear in other script extensions. The ScriptExtensions UCD file does not
+ // list these extensions, therefore this peculiarity must be handled programmatically.
+ // https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
+ auto code_point_has_other_extension = [&](StringView key, u32 code_point) {
+ for (auto const& extension : extensions) {
+ if (extension.key == key)
+ continue;
+ if (any_of(extension.value, [&](auto const& r) { return (r.first <= code_point) && (code_point <= r.last); }))
+ return true;
+ }
+
+ return false;
+ };
+
+ auto get_code_points_without_other_extensions = [&](StringView key) {
+ auto code_points = flatten_code_point_ranges(script_list.find(key)->value);
+ code_points.remove_all_matching([&](u32 c) { return code_point_has_other_extension(key, c); });
+ return code_points;
+ };
+
+ auto common_code_points = get_code_points_without_other_extensions("Common"sv);
+ script_extensions.set("Common"sv, form_code_point_ranges(common_code_points));
+
+ auto inherited_code_points = get_code_points_without_other_extensions("Inherited"sv);
+ script_extensions.set("Inherited"sv, form_code_point_ranges(inherited_code_points));
}
int main(int argc, char** argv)