diff options
-rw-r--r-- | Tests/LibUnicode/TestUnicodeLocale.cpp | 24 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/Locale.cpp | 41 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/Locale.h | 1 |
3 files changed, 65 insertions, 1 deletions
diff --git a/Tests/LibUnicode/TestUnicodeLocale.cpp b/Tests/LibUnicode/TestUnicodeLocale.cpp
index dc0aa45974..80b7ee8ba1 100644
--- a/Tests/LibUnicode/TestUnicodeLocale.cpp
+++ b/Tests/LibUnicode/TestUnicodeLocale.cpp
@@ -246,6 +246,30 @@ TEST_CASE(parse_unicode_locale_id_with_other_extension)
     pass("en-z-aa-bbb-cccccccc", { 'z', { "aa"sv, "bbb"sv, "cccccccc"sv } });
 }
 
+TEST_CASE(parse_unicode_locale_id_with_private_use_extension)
+{
+    auto fail = [](StringView locale) {
+        auto locale_id = Unicode::parse_unicode_locale_id(locale);
+        EXPECT(!locale_id.has_value());
+    };
+    auto pass = [](StringView locale, Vector<StringView> const& expected_extension) {
+        auto locale_id = Unicode::parse_unicode_locale_id(locale);
+        VERIFY(locale_id.has_value());
+        EXPECT_EQ(locale_id->private_use_extensions, expected_extension);
+    };
+
+    fail("en-x"sv);
+    fail("en-x-"sv);
+    fail("en-x-aaaaaaaaa"sv);
+    fail("en-x-aaa-"sv);
+    fail("en-x-aaa-aaaaaaaaa"sv);
+
+    pass("en-x-a", { "a"sv });
+    pass("en-x-aaaaaaaa", { "aaaaaaaa"sv });
+    pass("en-x-aaa-bbb", { "aaa"sv, "bbb"sv });
+    pass("en-x-aaa-x-bbb", { "aaa"sv, "x"sv, "bbb"sv });
+}
+
 TEST_CASE(canonicalize_unicode_locale_id)
 {
     auto test = [](StringView locale, StringView expected_canonical_locale) {
diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp
index d886c4a812..9dda3c2bba 100644
--- a/Userland/Libraries/LibUnicode/Locale.cpp
+++ b/Userland/Libraries/LibUnicode/Locale.cpp
@@ -400,6 +400,45 @@ static Optional<Extension> parse_extension(GenericLexer& lexer)
     return {};
 }
 
+static Vector<StringView> parse_private_use_extensions(GenericLexer& lexer)
+{
+    // https://unicode.org/reports/tr35/#pu_extensions
+    //
+    // pu_extensions = = sep [xX] (sep alphanum{1,8})+ ;
+    size_t starting_position = lexer.tell();
+
+    auto header = consume_next_segment(lexer);
+    if (!header.has_value())
+        return {};
+
+    auto parse_values = [&]() -> Vector<StringView> {
+        Vector<StringView> extensions;
+
+        while (true) {
+            auto segment = consume_next_segment(lexer);
+            if (!segment.has_value())
+                break;
+
+            if ((segment->length() < 1) || (segment->length() > 8) || !all_of(*segment, is_ascii_alphanumeric)) {
+                lexer.retreat(segment->length() + 1);
+                break;
+            }
+
+            extensions.append(*segment);
+        }
+
+        return extensions;
+    };
+
+    if ((header->length() == 1) && (((*header)[0] == 'x') || ((*header)[0] == 'X'))) {
+        if (auto extensions = parse_values(); !extensions.is_empty())
+            return extensions;
+    }
+
+    lexer.retreat(lexer.tell() - starting_position);
+    return {};
+}
+
 Optional<LanguageID> parse_unicode_language_id(StringView language)
 {
     GenericLexer lexer { language };
@@ -433,7 +472,7 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
         locale_id.extensions.append(extension.release_value());
     }
 
-    // FIXME: Handle pu_extensions.
+    locale_id.private_use_extensions = parse_private_use_extensions(lexer);
 
     if (!lexer.is_eof())
         return {};
diff --git a/Userland/Libraries/LibUnicode/Locale.h b/Userland/Libraries/LibUnicode/Locale.h
index b8fc43d1e4..5fc244e5ed 100644
--- a/Userland/Libraries/LibUnicode/Locale.h
+++ b/Userland/Libraries/LibUnicode/Locale.h
@@ -53,6 +53,7 @@ using Extension = Variant<LocaleExtension, TransformedExtension, OtherExtension>
 struct LocaleID {
     LanguageID language_id {};
     Vector<Extension> extensions {};
+    Vector<StringView> private_use_extensions {};
 };
 
 // Note: These methods only verify that the provided strings match the EBNF grammar of the