summary | refs | log | tree | commit | diff
path: root/Userland/Libraries/LibUnicode/Locale.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'Userland/Libraries/LibUnicode/Locale.cpp')
-rw-r--r-- Userland/Libraries/LibUnicode/Locale.cpp 136
1 files changed, 132 insertions, 4 deletions
diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp
index 4ccf9e9a89..6f52ef25a8 100644
--- a/Userland/Libraries/LibUnicode/Locale.cpp
+++ b/Userland/Libraries/LibUnicode/Locale.cpp
@@ -53,7 +53,32 @@ bool is_unicode_variant_subtag(StringView subtag)
return false;
}
-static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator)
+static bool is_key(StringView key) // Returns true when `key` matches the keyword-key production documented below.
+{
+ // key = alphanum alpha (exactly two characters: an ASCII alphanumeric followed by an ASCII letter)
+ if (key.length() != 2)
+ return false;
+ return is_ascii_alphanumeric(key[0]) && is_ascii_alpha(key[1]);
+}
+
+static bool is_single_type(StringView type) // Returns true when `type` is one valid segment of a keyword type.
+{
+ // type = alphanum{3,8} (sep alphanum{3,8})*
+ // Note: Consecutive types are not handled here, that is left to the caller.
+ if ((type.length() < 3) || (type.length() > 8)) // Only lengths 3 through 8 are accepted.
+ return false;
+ return all_of(type, is_ascii_alphanumeric); // Every character must be ASCII alphanumeric.
+}
+
+static bool is_attribute(StringView type) // Returns true when the segment is a valid attribute; note the parameter is named `type` although it holds an attribute candidate.
+{
+ // attribute = alphanum{3,8}
+ if ((type.length() < 3) || (type.length() > 8)) // Only lengths 3 through 8 are accepted.
+ return false;
+ return all_of(type, is_ascii_alphanumeric); // Every character must be ASCII alphanumeric.
+}
+
+static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator = true)
{
constexpr auto is_separator = is_any_of("-_"sv);
@@ -153,6 +178,101 @@ static Optional<LanguageID> parse_unicode_language_id(GenericLexer& lexer)
return language_id;
}
+static Optional<LocaleExtension> parse_unicode_locale_extension(GenericLexer& lexer)
+{
+ // https://unicode.org/reports/tr35/#unicode_locale_extensions
+ // Collects attributes and keywords from the lexer; the [uU] singleton is not consumed here (the first segment read is immediately classified as an attribute or a key).
+ // unicode_locale_extensions = sep [uU] ((sep keyword)+ | (sep attribute)+ (sep keyword)*)
+ LocaleExtension locale_extension {};
+
+ enum class ParseState {
+ ParsingAttributeOrKeyword, // Initial state: the first segment decides which branch of the grammar applies.
+ ParsingAttribute, // Reading attributes; the first non-attribute segment switches to keywords.
+ ParsingKeyword, // Reading keywords (a key followed by zero or more types).
+ Done, // A segment that is not a key was seen; stop and hand it back to the lexer.
+ };
+
+ auto state = ParseState::ParsingAttributeOrKeyword;
+
+ while (!lexer.is_eof() && (state != ParseState::Done)) {
+ auto segment = consume_next_segment(lexer);
+ if (!segment.has_value())
+ return {}; // Input remains but no segment could be read: fail the whole extension.
+
+ if (state == ParseState::ParsingAttributeOrKeyword)
+ state = is_key(*segment) ? ParseState::ParsingKeyword : ParseState::ParsingAttribute;
+
+ switch (state) {
+ case ParseState::ParsingAttribute:
+ if (is_attribute(*segment)) {
+ locale_extension.attributes.append(*segment);
+ break;
+ }
+
+ state = ParseState::ParsingKeyword; // Not an attribute: re-examine this same segment as a keyword key.
+ [[fallthrough]];
+
+ case ParseState::ParsingKeyword: {
+ // keyword = key (sep type)?
+ Keyword keyword { .key = *segment }; // Built before validation; discarded if the segment turns out not to be a key.
+
+ if (!is_key(*segment)) {
+ lexer.retreat(segment->length() + 1); // Un-read the segment; the +1 presumably covers the separator consumed with it (consume_next_segment's body is not visible here).
+ state = ParseState::Done;
+ break;
+ }
+
+ while (true) { // Gather every type segment that follows the key.
+ auto type = consume_next_segment(lexer);
+
+ if (!type.has_value() || !is_single_type(*type)) {
+ if (type.has_value())
+ lexer.retreat(type->length() + 1); // Hand the non-type segment back so the outer loop parses it.
+ break;
+ }
+
+ keyword.types.append(*type);
+ }
+
+ locale_extension.keywords.append(move(keyword));
+ break;
+ }
+
+ default:
+ VERIFY_NOT_REACHED(); // ParsingAttributeOrKeyword is resolved before the switch, and Done is only set on the way out of it.
+ }
+ }
+
+ if (locale_extension.attributes.is_empty() && locale_extension.keywords.is_empty())
+ return {}; // Nothing was parsed, so no extension is reported.
+ return locale_extension;
+}
+
+static Optional<Extension> parse_extension(GenericLexer& lexer)
+{
+ // https://unicode.org/reports/tr35/#extensions
+ // Tries to parse one extension; on failure the lexer is rewound to where it started and an empty Optional is returned.
+ // extensions = unicode_locale_extensions | transformed_extensions | other_extensions
+ size_t starting_position = lexer.tell(); // Remembered so everything consumed below can be un-read on failure.
+
+ if (auto header = consume_next_segment(lexer); header.has_value() && (header->length() == 1)) { // Only single-character headers are inspected.
+ switch ((*header)[0]) {
+ case 'u':
+ case 'U':
+ if (auto extension = parse_unicode_locale_extension(lexer); extension.has_value())
+ return Extension { extension.release_value() };
+ break;
+
+ default:
+ // FIXME: Handle transformed_extensions / other_extensions
+ break;
+ }
+ }
+
+ lexer.retreat(lexer.tell() - starting_position); // Restore the lexer to its position on entry.
+ return {};
+}
+
Optional<LanguageID> parse_unicode_language_id(StringView language)
{
GenericLexer lexer { language };
@@ -167,7 +287,6 @@ Optional<LanguageID> parse_unicode_language_id(StringView language)
Optional<LocaleID> parse_unicode_locale_id(StringView locale)
{
GenericLexer lexer { locale };
- LocaleID locale_id {};
// https://unicode.org/reports/tr35/#Unicode_locale_identifier
//
@@ -178,12 +297,21 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
if (!language_id.has_value())
return {};
- // FIXME: Handle extensions and pu_extensions.
+ LocaleID locale_id { language_id.release_value() };
+
+ while (true) {
+ auto extension = parse_extension(lexer);
+ if (!extension.has_value())
+ break;
+ locale_id.extensions.append(extension.release_value());
+ }
+
+ // FIXME: Handle pu_extensions.
if (!lexer.is_eof())
return {};
- return LocaleID { language_id.release_value() };
+ return locale_id;
}
Optional<String> canonicalize_unicode_locale_id(LocaleID& locale_id)