diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-08-27 16:38:06 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2021-08-30 19:42:40 +0100 |
commit | eda92d15e4c5f6eb91263695c42997b45de99c98 (patch) | |
tree | 7e0cde51b0484fcc305f07e71400bd9ff72633e4 /Userland/Libraries/LibUnicode | |
parent | dd89901b070c07cd54095931c4b03fc5beb3316d (diff) | |
download | serenity-eda92d15e4c5f6eb91263695c42997b45de99c98.zip |
LibUnicode: Parse locale extensions of the Unicode locale extension form
Diffstat (limited to 'Userland/Libraries/LibUnicode')
-rw-r--r-- | Userland/Libraries/LibUnicode/Locale.cpp | 136 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/Locale.h | 14 |
2 files changed, 146 insertions, 4 deletions
```diff
diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp
index 4ccf9e9a89..6f52ef25a8 100644
--- a/Userland/Libraries/LibUnicode/Locale.cpp
+++ b/Userland/Libraries/LibUnicode/Locale.cpp
@@ -53,7 +53,32 @@ bool is_unicode_variant_subtag(StringView subtag)
     return false;
 }
 
-static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator)
+static bool is_key(StringView key)
+{
+    // key = alphanum alpha
+    if (key.length() != 2)
+        return false;
+    return is_ascii_alphanumeric(key[0]) && is_ascii_alpha(key[1]);
+}
+
+static bool is_single_type(StringView type)
+{
+    // type = alphanum{3,8} (sep alphanum{3,8})*
+    // Note: Consecutive types are not handled here, that is left to the caller.
+    if ((type.length() < 3) || (type.length() > 8))
+        return false;
+    return all_of(type, is_ascii_alphanumeric);
+}
+
+static bool is_attribute(StringView type)
+{
+    // attribute = alphanum{3,8}
+    if ((type.length() < 3) || (type.length() > 8))
+        return false;
+    return all_of(type, is_ascii_alphanumeric);
+}
+
+static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator = true)
 {
     constexpr auto is_separator = is_any_of("-_"sv);
 
@@ -153,6 +178,101 @@ static Optional<LanguageID> parse_unicode_language_id(GenericLexer& lexer)
     return language_id;
 }
 
+static Optional<LocaleExtension> parse_unicode_locale_extension(GenericLexer& lexer)
+{
+    // https://unicode.org/reports/tr35/#unicode_locale_extensions
+    //
+    // unicode_locale_extensions = sep [uU] ((sep keyword)+ | (sep attribute)+ (sep keyword)*)
+    LocaleExtension locale_extension {};
+
+    enum class ParseState {
+        ParsingAttributeOrKeyword,
+        ParsingAttribute,
+        ParsingKeyword,
+        Done,
+    };
+
+    auto state = ParseState::ParsingAttributeOrKeyword;
+
+    while (!lexer.is_eof() && (state != ParseState::Done)) {
+        auto segment = consume_next_segment(lexer);
+        if (!segment.has_value())
+            return {};
+
+        if (state == ParseState::ParsingAttributeOrKeyword)
+            state = is_key(*segment) ? ParseState::ParsingKeyword : ParseState::ParsingAttribute;
+
+        switch (state) {
+        case ParseState::ParsingAttribute:
+            if (is_attribute(*segment)) {
+                locale_extension.attributes.append(*segment);
+                break;
+            }
+
+            state = ParseState::ParsingKeyword;
+            [[fallthrough]];
+
+        case ParseState::ParsingKeyword: {
+            // keyword = key (sep type)?
+            Keyword keyword { .key = *segment };
+
+            if (!is_key(*segment)) {
+                lexer.retreat(segment->length() + 1);
+                state = ParseState::Done;
+                break;
+            }
+
+            while (true) {
+                auto type = consume_next_segment(lexer);
+
+                if (!type.has_value() || !is_single_type(*type)) {
+                    if (type.has_value())
+                        lexer.retreat(type->length() + 1);
+                    break;
+                }
+
+                keyword.types.append(*type);
+            }
+
+            locale_extension.keywords.append(move(keyword));
+            break;
+        }
+
+        default:
+            VERIFY_NOT_REACHED();
+        }
+    }
+
+    if (locale_extension.attributes.is_empty() && locale_extension.keywords.is_empty())
+        return {};
+    return locale_extension;
+}
+
+static Optional<Extension> parse_extension(GenericLexer& lexer)
+{
+    // https://unicode.org/reports/tr35/#extensions
+    //
+    // extensions = unicode_locale_extensions | transformed_extensions | other_extensions
+    size_t starting_position = lexer.tell();
+
+    if (auto header = consume_next_segment(lexer); header.has_value() && (header->length() == 1)) {
+        switch ((*header)[0]) {
+        case 'u':
+        case 'U':
+            if (auto extension = parse_unicode_locale_extension(lexer); extension.has_value())
+                return Extension { extension.release_value() };
+            break;
+
+        default:
+            // FIXME: Handle transformed_extensions / other_extensions
+            break;
+        }
+    }
+
+    lexer.retreat(lexer.tell() - starting_position);
+    return {};
+}
+
 Optional<LanguageID> parse_unicode_language_id(StringView language)
 {
     GenericLexer lexer { language };
@@ -167,7 +287,6 @@ Optional<LanguageID> parse_unicode_language_id(StringView language)
 Optional<LocaleID> parse_unicode_locale_id(StringView locale)
 {
     GenericLexer lexer { locale };
-    LocaleID locale_id {};
 
     // https://unicode.org/reports/tr35/#Unicode_locale_identifier
     //
@@ -178,12 +297,21 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
     if (!language_id.has_value())
         return {};
 
-    // FIXME: Handle extensions and pu_extensions.
+    LocaleID locale_id { language_id.release_value() };
+
+    while (true) {
+        auto extension = parse_extension(lexer);
+        if (!extension.has_value())
+            break;
+        locale_id.extensions.append(extension.release_value());
+    }
+
+    // FIXME: Handle pu_extensions.
 
     if (!lexer.is_eof())
         return {};
 
-    return LocaleID { language_id.release_value() };
+    return locale_id;
 }
 
 Optional<String> canonicalize_unicode_locale_id(LocaleID& locale_id)
diff --git a/Userland/Libraries/LibUnicode/Locale.h b/Userland/Libraries/LibUnicode/Locale.h
index d86cb97d6f..742970a125 100644
--- a/Userland/Libraries/LibUnicode/Locale.h
+++ b/Userland/Libraries/LibUnicode/Locale.h
@@ -9,6 +9,7 @@
 #include <AK/Optional.h>
 #include <AK/String.h>
 #include <AK/StringView.h>
+#include <AK/Variant.h>
 #include <AK/Vector.h>
 #include <LibUnicode/Forward.h>
 
@@ -22,8 +23,21 @@ struct LanguageID {
     Vector<StringView> variants {};
 };
 
+struct Keyword {
+    StringView key {};
+    Vector<StringView> types {};
+};
+
+struct LocaleExtension {
+    Vector<StringView> attributes {};
+    Vector<Keyword> keywords {};
+};
+
+using Extension = Variant<LocaleExtension>;
+
 struct LocaleID {
     LanguageID language_id {};
+    Vector<Extension> extensions {};
 };
 
 // Note: These methods only verify that the provided strings match the EBNF grammar of the
```