diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-11-17 09:56:16 -0500 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2021-11-19 11:45:35 +0100 |
commit | 93ee9220270c86f63c5629ffe974f8eccb1c414d (patch) | |
tree | b93738866d4ab7d1053e937c83a6eca9c98f7925 | |
parent | 4b535ce1c83696c8f856053c84921323c2e4a040 (diff) | |
download | serenity-93ee9220270c86f63c5629ffe974f8eccb1c414d.zip |
LibUnicode: Support locales-without-script aliases for ECMA-402
As noted by ECMA-402, if a supported locale contains all of a language,
script, and region subtag, then the implementation must also support the
locale without the script subtag. The most complicated example of this
is the zh-TW locale.
The list of locales in the CLDR database does not include zh-TW or its
maximized zh-Hant-TW variant. Instead, it includes the zh-Hant locale.
However, zh-Hant-TW is listed in the default-content locale list in the
cldr-core package. This defines an alias from zh-Hant-TW to zh-Hant. We
must then also support the zh-Hant-TW alias without the script subtag:
zh-TW. This transitively maps zh-TW to zh-Hant, which is a case quite
heavily tested by test262.
-rw-r--r-- | Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp | 43 | ||||
-rw-r--r-- | Tests/LibUnicode/TestUnicodeLocale.cpp | 8 |
2 files changed, 51 insertions, 0 deletions
diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp index 78e64a861c..f8cf25ec5a 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp @@ -427,6 +427,48 @@ static void parse_default_content_locales(String core_path, UnicodeLocaleData& l }); } +static void define_aliases_without_scripts(UnicodeLocaleData& locale_data) +{ + // From ECMA-402: https://tc39.es/ecma402/#sec-internal-slots + // + // For locales that include a script subtag in addition to language and region, the + // corresponding locale without a script subtag must also be supported. + // + // So we define aliases for locales that contain all three subtags, but we must also take + // care to handle when the locale itself or the locale without a script subtag are an alias + // by way of default-content locales. + auto find_alias = [&](auto const& locale) { + return locale_data.locale_aliases.find_if([&](auto const& alias) { return locale == alias.alias; }); + }; + + auto append_alias_without_script = [&](auto const& locale) { + auto parsed_locale = CanonicalLanguageID<StringIndexType>::parse(locale_data.unique_strings, locale); + VERIFY(parsed_locale.has_value()); + + if ((parsed_locale->language == 0) || (parsed_locale->script == 0) || (parsed_locale->region == 0)) + return; + + auto locale_without_script = String::formatted("{}-{}", + locale_data.unique_strings.get(parsed_locale->language), + locale_data.unique_strings.get(parsed_locale->region)); + + if (locale_data.locales.contains(locale_without_script)) + return; + if (find_alias(locale_without_script) != locale_data.locale_aliases.end()) + return; + + if (auto it = find_alias(locale); it != locale_data.locale_aliases.end()) + locale_data.locale_aliases.append({ it->name, locale_without_script }); + else + locale_data.locale_aliases.append({ locale, 
locale_without_script }); + }; + + for (auto const& locale : locale_data.locales) + append_alias_without_script(locale.key); + for (auto const& locale : locale_data.locale_aliases) + append_alias_without_script(locale.alias); +} + static void parse_all_locales(String core_path, String locale_names_path, String misc_path, String numbers_path, UnicodeLocaleData& locale_data) { auto identity_iterator = path_to_dir_iterator(locale_names_path); @@ -508,6 +550,7 @@ static void parse_all_locales(String core_path, String locale_names_path, String } parse_default_content_locales(move(core_path), locale_data); + define_aliases_without_scripts(locale_data); } static String format_identifier(StringView owner, String identifier) diff --git a/Tests/LibUnicode/TestUnicodeLocale.cpp b/Tests/LibUnicode/TestUnicodeLocale.cpp index 34619c8cd7..9f8064212a 100644 --- a/Tests/LibUnicode/TestUnicodeLocale.cpp +++ b/Tests/LibUnicode/TestUnicodeLocale.cpp @@ -463,3 +463,11 @@ TEST_CASE(canonicalize_unicode_locale_id) test("zh-Hans-CN"sv, "zh-Hans-CN"sv); test("ZH-HANS-CN"sv, "zh-Hans-CN"sv); } + +TEST_CASE(supports_locale_aliases) +{ + EXPECT(Unicode::is_locale_available("zh"sv)); + EXPECT(Unicode::is_locale_available("zh-Hant"sv)); + EXPECT(Unicode::is_locale_available("zh-TW"sv)); + EXPECT(Unicode::is_locale_available("zh-Hant-TW"sv)); +} |