diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-08-24 22:15:38 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2021-08-26 22:04:09 +0100 |
commit | b7a95cba65988ab34a3f859151d97af24949195d (patch) | |
tree | 3c7fc6a9626f3bba2d2f9460cf0e53738478a28e | |
parent | 3127454642dbc28ec2e2930b75524276f3386133 (diff) | |
download | serenity-b7a95cba65988ab34a3f859151d97af24949195d.zip |
LibUnicode: Implement grammar validators for Unicode TR-35
ECMA-402 requires validating user input against the EBNF grammar for
Unicode locales described in TR-35: https://www.unicode.org/reports/tr35
This commit adds validators for that grammar, as well as other helpers,
e.g. to canonicalize a locale string.
-rw-r--r-- | Tests/LibUnicode/TestUnicodeLocale.cpp | 120 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/CMakeLists.txt | 1 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/Locale.cpp | 152 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/Locale.h | 40 |
4 files changed, 313 insertions, 0 deletions
diff --git a/Tests/LibUnicode/TestUnicodeLocale.cpp b/Tests/LibUnicode/TestUnicodeLocale.cpp new file mode 100644 index 0000000000..95ae10cb4a --- /dev/null +++ b/Tests/LibUnicode/TestUnicodeLocale.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include <LibTest/TestCase.h> + +#include <LibUnicode/Locale.h> + +TEST_CASE(is_unicode_language_subtag) +{ + EXPECT(Unicode::is_unicode_language_subtag("aa"sv)); + EXPECT(Unicode::is_unicode_language_subtag("aaa"sv)); + EXPECT(Unicode::is_unicode_language_subtag("aaaaa"sv)); + EXPECT(Unicode::is_unicode_language_subtag("aaaaaa"sv)); + EXPECT(Unicode::is_unicode_language_subtag("aaaaaaa"sv)); + EXPECT(Unicode::is_unicode_language_subtag("aaaaaaaa"sv)); + + EXPECT(!Unicode::is_unicode_language_subtag(""sv)); + EXPECT(!Unicode::is_unicode_language_subtag("a"sv)); + EXPECT(!Unicode::is_unicode_language_subtag("aaaa"sv)); + EXPECT(!Unicode::is_unicode_language_subtag("aaaaaaaaa"sv)); + EXPECT(!Unicode::is_unicode_language_subtag("123"sv)); +} + +TEST_CASE(is_unicode_script_subtag) +{ + EXPECT(Unicode::is_unicode_script_subtag("aaaa"sv)); + + EXPECT(!Unicode::is_unicode_script_subtag(""sv)); + EXPECT(!Unicode::is_unicode_script_subtag("a"sv)); + EXPECT(!Unicode::is_unicode_script_subtag("aa"sv)); + EXPECT(!Unicode::is_unicode_script_subtag("aaa"sv)); + EXPECT(!Unicode::is_unicode_script_subtag("aaaaa"sv)); + EXPECT(!Unicode::is_unicode_script_subtag("1234"sv)); +} + +TEST_CASE(is_unicode_region_subtag) +{ + EXPECT(Unicode::is_unicode_region_subtag("aa"sv)); + EXPECT(Unicode::is_unicode_region_subtag("123"sv)); + + EXPECT(!Unicode::is_unicode_region_subtag(""sv)); + EXPECT(!Unicode::is_unicode_region_subtag("a"sv)); + EXPECT(!Unicode::is_unicode_region_subtag("aaa"sv)); + EXPECT(!Unicode::is_unicode_region_subtag("12"sv)); + EXPECT(!Unicode::is_unicode_region_subtag("12a"sv)); +} + +TEST_CASE(is_unicode_variant_subtag) +{ + 
EXPECT(Unicode::is_unicode_variant_subtag("aaaaa"sv)); + EXPECT(Unicode::is_unicode_variant_subtag("aaaaaa"sv)); + EXPECT(Unicode::is_unicode_variant_subtag("aaaaaaa"sv)); + EXPECT(Unicode::is_unicode_variant_subtag("aaaaaaaa"sv)); + + EXPECT(Unicode::is_unicode_variant_subtag("1aaa"sv)); + EXPECT(Unicode::is_unicode_variant_subtag("12aa"sv)); + EXPECT(Unicode::is_unicode_variant_subtag("123a"sv)); + EXPECT(Unicode::is_unicode_variant_subtag("1234"sv)); + + EXPECT(!Unicode::is_unicode_variant_subtag(""sv)); + EXPECT(!Unicode::is_unicode_variant_subtag("a"sv)); + EXPECT(!Unicode::is_unicode_variant_subtag("aa"sv)); + EXPECT(!Unicode::is_unicode_variant_subtag("aaa"sv)); + EXPECT(!Unicode::is_unicode_variant_subtag("aaaa"sv)); + EXPECT(!Unicode::is_unicode_variant_subtag("aaaaaaaaa"sv)); + EXPECT(!Unicode::is_unicode_variant_subtag("a234"sv)); +} + +TEST_CASE(parse_unicode_locale_id) +{ + auto fail = [](StringView locale) { + auto locale_id = Unicode::parse_unicode_locale_id(locale); + EXPECT(!locale_id.has_value()); + }; + auto pass = [](StringView locale, Optional<StringView> expected_language, Optional<StringView> expected_script, Optional<StringView> expected_region, Vector<StringView> expected_variants) { + auto locale_id = Unicode::parse_unicode_locale_id(locale); + VERIFY(locale_id.has_value()); + + EXPECT_EQ(locale_id->language_id.language, expected_language); + EXPECT_EQ(locale_id->language_id.script, expected_script); + EXPECT_EQ(locale_id->language_id.region, expected_region); + EXPECT_EQ(locale_id->language_id.variants, expected_variants); + }; + + fail("a"sv); + fail("1234"sv); + fail("aaa-"sv); + fail("aaa-cc-"sv); + fail("aaa-bbbb-cc-"sv); + fail("aaa-bbbb-cc-123"sv); + + pass("aaa"sv, "aaa"sv, {}, {}, {}); + pass("aaa-bbbb"sv, "aaa"sv, "bbbb"sv, {}, {}); + pass("aaa-cc"sv, "aaa"sv, {}, "cc"sv, {}); + pass("aaa-bbbb-cc"sv, "aaa"sv, "bbbb"sv, "cc"sv, {}); + pass("aaa-bbbb-cc-1234"sv, "aaa"sv, "bbbb"sv, "cc"sv, { "1234"sv }); + 
pass("aaa-bbbb-cc-1234-5678"sv, "aaa"sv, "bbbb"sv, "cc"sv, { "1234"sv, "5678"sv }); +} + +TEST_CASE(canonicalize_unicode_locale_id) +{ + auto test = [](StringView locale, StringView expected_canonical_locale) { + auto locale_id = Unicode::parse_unicode_locale_id(locale); + VERIFY(locale_id.has_value()); + + auto canonical_locale = Unicode::canonicalize_unicode_locale_id(*locale_id); + EXPECT_EQ(canonical_locale, expected_canonical_locale); + }; + + test("aaa"sv, "aaa"sv); + test("AaA"sv, "aaa"sv); + test("aaa-bbbb"sv, "aaa-Bbbb"sv); + test("aaa-cc"sv, "aaa-CC"sv); + test("aaa-bBBB-cC"sv, "aaa-Bbbb-CC"sv); + test("aaa-bbbb-cc-1234"sv, "aaa-Bbbb-CC-1234"sv); + test("aaa-bbbb-cc-ABCDE"sv, "aaa-Bbbb-CC-abcde"sv); +} diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt index f0875b7de8..c7ded76f1e 100644 --- a/Userland/Libraries/LibUnicode/CMakeLists.txt +++ b/Userland/Libraries/LibUnicode/CMakeLists.txt @@ -3,6 +3,7 @@ include(unicode_data.cmake) SET(SOURCES ${UNICODE_DATA_SOURCES} CharacterTypes.cpp + Locale.cpp ) serenity_lib(LibUnicode unicode) diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp new file mode 100644 index 0000000000..4f796c053b --- /dev/null +++ b/Userland/Libraries/LibUnicode/Locale.cpp @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include <AK/AllOf.h> +#include <AK/CharacterTypes.h> +#include <AK/GenericLexer.h> +#include <AK/QuickSort.h> +#include <AK/StringBuilder.h> +#include <LibUnicode/Locale.h> + +namespace Unicode { + +bool is_unicode_language_subtag(StringView subtag) +{ + // unicode_language_subtag = alpha{2,3} | alpha{5,8} + if ((subtag.length() < 2) || (subtag.length() == 4) || (subtag.length() > 8)) + return false; + return all_of(subtag, is_ascii_alpha); +} + +bool is_unicode_script_subtag(StringView subtag) +{ + // unicode_script_subtag = alpha{4} + if 
(subtag.length() != 4) + return false; + return all_of(subtag, is_ascii_alpha); +} + +bool is_unicode_region_subtag(StringView subtag) +{ + // unicode_region_subtag = (alpha{2} | digit{3}) + if (subtag.length() == 2) + return all_of(subtag, is_ascii_alpha); + if (subtag.length() == 3) + return all_of(subtag, is_ascii_digit); + return false; +} + +bool is_unicode_variant_subtag(StringView subtag) +{ + // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) + if ((subtag.length() >= 5) && (subtag.length() <= 8)) + return all_of(subtag, is_ascii_alphanumeric); + if (subtag.length() == 4) + return is_ascii_digit(subtag[0]) && all_of(subtag.substring_view(1), is_ascii_alphanumeric); + return false; +} + +Optional<LanguageID> parse_unicode_language_id(StringView language) +{ + // https://unicode.org/reports/tr35/#Unicode_language_identifier + // + // unicode_language_id = "root" + // OR + // unicode_language_id = ((unicode_language_subtag (sep unicode_script_subtag)?) | unicode_script_subtag) + // (sep unicode_region_subtag)? + // (sep unicode_variant_subtag)* + LanguageID language_id {}; + + if (language == "root"sv) { + language_id.is_root = true; + return language_id; + } + + auto segments = language.split_view_if(is_any_of("-_"sv), true); // keep_empty=true to ensure valid data follows a separator. 
+ size_t index = 0; + + if (segments.size() == index) + return {}; + + if (is_unicode_language_subtag(segments[index])) { + language_id.language = segments[index]; + if (segments.size() == ++index) + return language_id; + } + + if (is_unicode_script_subtag(segments[index])) { + language_id.script = segments[index]; + if (segments.size() == ++index) + return language_id; + } else if (!language_id.language.has_value()) { + return {}; + } + + if (is_unicode_region_subtag(segments[index])) { + language_id.region = segments[index]; + if (segments.size() == ++index) + return language_id; + } + + while (index < segments.size()) { + if (!is_unicode_variant_subtag(segments[index])) + return {}; + language_id.variants.append(segments[index++]); + } + + return language_id; +} + +Optional<LocaleID> parse_unicode_locale_id(StringView locale) +{ + LocaleID locale_id {}; + + // https://unicode.org/reports/tr35/#Unicode_locale_identifier + // + // unicode_locale_id = unicode_language_id + // extensions* + // pu_extensions? + auto language_id = parse_unicode_language_id(locale); + if (!language_id.has_value()) + return {}; + + // FIXME: Handle extensions and pu_extensions. 
+ return LocaleID { language_id.release_value() }; +} + +Optional<String> canonicalize_unicode_locale_id(LocaleID& locale_id) +{ + // https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers + StringBuilder builder; + + if (!locale_id.language_id.language.has_value()) + return {}; + + builder.append(locale_id.language_id.language->to_lowercase_string()); + + if (locale_id.language_id.script.has_value()) { + builder.append('-'); + builder.append(locale_id.language_id.script->to_titlecase_string()); + } + + if (locale_id.language_id.region.has_value()) { + builder.append('-'); + builder.append(locale_id.language_id.region->to_uppercase_string()); + } + + quick_sort(locale_id.language_id.variants); + + for (auto const& variant : locale_id.language_id.variants) { + builder.append('-'); + builder.append(variant.to_lowercase_string()); + } + + // FIXME: Handle extensions and pu_extensions. + + return builder.build(); +} + +} diff --git a/Userland/Libraries/LibUnicode/Locale.h b/Userland/Libraries/LibUnicode/Locale.h new file mode 100644 index 0000000000..45af6fa4a2 --- /dev/null +++ b/Userland/Libraries/LibUnicode/Locale.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include <AK/Optional.h> +#include <AK/String.h> +#include <AK/StringView.h> +#include <AK/Vector.h> +#include <LibUnicode/Forward.h> + +namespace Unicode { + +struct LanguageID { + bool is_root { false }; + Optional<StringView> language {}; + Optional<StringView> script {}; + Optional<StringView> region {}; + Vector<StringView> variants {}; +}; + +struct LocaleID { + LanguageID language_id {}; +}; + +// Note: These methods only verify that the provided strings match the EBNF grammar of the +// Unicode identifier subtag (i.e. no validation is done that the tags actually exist). 
+bool is_unicode_language_subtag(StringView); +bool is_unicode_script_subtag(StringView); +bool is_unicode_region_subtag(StringView); +bool is_unicode_variant_subtag(StringView); + +Optional<LanguageID> parse_unicode_language_id(StringView); +Optional<LocaleID> parse_unicode_locale_id(StringView); +Optional<String> canonicalize_unicode_locale_id(LocaleID&); + +} |