summary refs log tree commit diff
path: root/Userland/Libraries/LibUnicode
diff options
context:
space:
mode:
author Timothy Flynn <trflynn89@pm.me> 2021-08-27 16:38:06 -0400
committer Linus Groh <mail@linusgroh.de> 2021-08-30 19:42:40 +0100
commit eda92d15e4c5f6eb91263695c42997b45de99c98 (patch)
tree 7e0cde51b0484fcc305f07e71400bd9ff72633e4 /Userland/Libraries/LibUnicode
parent dd89901b070c07cd54095931c4b03fc5beb3316d (diff)
download serenity-eda92d15e4c5f6eb91263695c42997b45de99c98.zip
LibUnicode: Parse locale extensions of the Unicode locale extension form
Diffstat (limited to 'Userland/Libraries/LibUnicode')
-rw-r--r-- Userland/Libraries/LibUnicode/Locale.cpp 136
-rw-r--r-- Userland/Libraries/LibUnicode/Locale.h 14
2 files changed, 146 insertions, 4 deletions
diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp
index 4ccf9e9a89..6f52ef25a8 100644
--- a/Userland/Libraries/LibUnicode/Locale.cpp
+++ b/Userland/Libraries/LibUnicode/Locale.cpp
@@ -53,7 +53,32 @@ bool is_unicode_variant_subtag(StringView subtag)
return false;
}
-static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator)
+static bool is_key(StringView key) // Reports whether `key` has the shape of a Unicode locale extension key.
+{
+ // key = alphanum alpha
+ if (key.length() != 2) // A key is exactly two characters long.
+ return false;
+ return is_ascii_alphanumeric(key[0]) && is_ascii_alpha(key[1]); // First character: alphanum; second character: alpha only.
+}
+
+static bool is_single_type(StringView type) // Validates one separator-free segment of a keyword's type.
+{
+ // type = alphanum{3,8} (sep alphanum{3,8})*
+ // Note: Consecutive types are not handled here, that is left to the caller.
+ if ((type.length() < 3) || (type.length() > 8)) // Reject anything outside the inclusive 3..8 length range.
+ return false;
+ return all_of(type, is_ascii_alphanumeric); // Every character must satisfy is_ascii_alphanumeric.
+}
+
+static bool is_attribute(StringView type) // Validates a single attribute segment; performs the same checks as is_single_type.
+{
+ // attribute = alphanum{3,8}
+ if ((type.length() < 3) || (type.length() > 8)) // Reject anything outside the inclusive 3..8 length range.
+ return false;
+ return all_of(type, is_ascii_alphanumeric); // Every character must satisfy is_ascii_alphanumeric.
+}
+
+static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator = true)
{
constexpr auto is_separator = is_any_of("-_"sv);
@@ -153,6 +178,101 @@ static Optional<LanguageID> parse_unicode_language_id(GenericLexer& lexer)
return language_id;
}
+static Optional<LocaleExtension> parse_unicode_locale_extension(GenericLexer& lexer) // Collects the attributes and keywords of a `u` extension from `lexer`.
+{
+ // https://unicode.org/reports/tr35/#unicode_locale_extensions
+ // Returns an empty Optional when a segment cannot be consumed or when no attribute and no keyword was collected.
+ // unicode_locale_extensions = sep [uU] ((sep keyword)+ | (sep attribute)+ (sep keyword)*)
+ LocaleExtension locale_extension {};
+
+ enum class ParseState {
+ ParsingAttributeOrKeyword, // Initial state: the first segment decides which list comes first.
+ ParsingAttribute, // Accumulating attributes.
+ ParsingKeyword, // Accumulating keywords; never returns to ParsingAttribute.
+ Done, // A segment that is not part of this extension was seen.
+ };
+
+ auto state = ParseState::ParsingAttributeOrKeyword;
+
+ while (!lexer.is_eof() && (state != ParseState::Done)) {
+ auto segment = consume_next_segment(lexer);
+ if (!segment.has_value())
+ return {};
+
+ if (state == ParseState::ParsingAttributeOrKeyword) // Only true for the first segment.
+ state = is_key(*segment) ? ParseState::ParsingKeyword : ParseState::ParsingAttribute;
+
+ switch (state) {
+ case ParseState::ParsingAttribute:
+ if (is_attribute(*segment)) {
+ locale_extension.attributes.append(*segment);
+ break;
+ }
+
+ state = ParseState::ParsingKeyword; // First non-attribute segment: re-examine it as a keyword key below.
+ [[fallthrough]];
+
+ case ParseState::ParsingKeyword: {
+ // keyword = key (sep type)?
+ Keyword keyword { .key = *segment }; // Built before validation; dropped if the segment is not a key.
+
+ if (!is_key(*segment)) {
+ lexer.retreat(segment->length() + 1); // Un-consume the segment; the +1 presumably covers its leading separator - confirm against consume_next_segment.
+ state = ParseState::Done;
+ break;
+ }
+
+ while (true) { // Gather zero or more type segments that follow the key.
+ auto type = consume_next_segment(lexer);
+
+ if (!type.has_value() || !is_single_type(*type)) {
+ if (type.has_value()) // A segment was consumed but is not a type: put it back for the outer loop.
+ lexer.retreat(type->length() + 1);
+ break;
+ }
+
+ keyword.types.append(*type);
+ }
+
+ locale_extension.keywords.append(move(keyword));
+ break;
+ }
+
+ default: // ParsingAttributeOrKeyword is resolved above and Done ends the loop, so neither should arrive here.
+ VERIFY_NOT_REACHED();
+ }
+ }
+
+ if (locale_extension.attributes.is_empty() && locale_extension.keywords.is_empty()) // Nothing was collected.
+ return {};
+ return locale_extension;
+}
+
+static Optional<Extension> parse_extension(GenericLexer& lexer) // Tries to parse one extension; on failure the lexer is rewound to where it started.
+{
+ // https://unicode.org/reports/tr35/#extensions
+ //
+ // extensions = unicode_locale_extensions | transformed_extensions | other_extensions
+ size_t starting_position = lexer.tell(); // Remembered so a failed attempt can be undone.
+
+ if (auto header = consume_next_segment(lexer); header.has_value() && (header->length() == 1)) { // The extension header is a single-character segment.
+ switch ((*header)[0]) {
+ case 'u':
+ case 'U':
+ if (auto extension = parse_unicode_locale_extension(lexer); extension.has_value())
+ return Extension { extension.release_value() };
+ break; // Parsing failed: fall through to the rewind below.
+
+ default:
+ // FIXME: Handle transformed_extensions / other_extensions
+ break;
+ }
+ }
+
+ lexer.retreat(lexer.tell() - starting_position); // Undo everything consumed by this call.
+ return {};
+}
+
Optional<LanguageID> parse_unicode_language_id(StringView language)
{
GenericLexer lexer { language };
@@ -167,7 +287,6 @@ Optional<LanguageID> parse_unicode_language_id(StringView language)
Optional<LocaleID> parse_unicode_locale_id(StringView locale)
{
GenericLexer lexer { locale };
- LocaleID locale_id {};
// https://unicode.org/reports/tr35/#Unicode_locale_identifier
//
@@ -178,12 +297,21 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
if (!language_id.has_value())
return {};
- // FIXME: Handle extensions and pu_extensions.
+ LocaleID locale_id { language_id.release_value() };
+
+ while (true) {
+ auto extension = parse_extension(lexer);
+ if (!extension.has_value())
+ break;
+ locale_id.extensions.append(extension.release_value());
+ }
+
+ // FIXME: Handle pu_extensions.
if (!lexer.is_eof())
return {};
- return LocaleID { language_id.release_value() };
+ return locale_id;
}
Optional<String> canonicalize_unicode_locale_id(LocaleID& locale_id)
diff --git a/Userland/Libraries/LibUnicode/Locale.h b/Userland/Libraries/LibUnicode/Locale.h
index d86cb97d6f..742970a125 100644
--- a/Userland/Libraries/LibUnicode/Locale.h
+++ b/Userland/Libraries/LibUnicode/Locale.h
@@ -9,6 +9,7 @@
#include <AK/Optional.h>
#include <AK/String.h>
#include <AK/StringView.h>
+#include <AK/Variant.h>
#include <AK/Vector.h>
#include <LibUnicode/Forward.h>
@@ -22,8 +23,21 @@ struct LanguageID {
Vector<StringView> variants {};
};
+struct Keyword { // One keyword of a Unicode locale extension: a key plus its type segments.
+ StringView key {}; // The key segment (grammar: alphanum alpha).
+ Vector<StringView> types {}; // Type segments following the key, in input order; may be empty.
+};
+
+struct LocaleExtension { // Parsed contents of a Unicode locale (`u`) extension.
+ Vector<StringView> attributes {}; // Attribute segments, in input order.
+ Vector<Keyword> keywords {}; // Keywords, in input order.
+};
+
+using Extension = Variant<LocaleExtension>; // Any parsed extension; LocaleExtension is currently the only alternative.
+
struct LocaleID { // A parsed Unicode locale identifier.
LanguageID language_id {}; // The language identifier portion of the locale.
+ Vector<Extension> extensions {}; // Extensions following the language identifier, in input order.
};
// Note: These methods only verify that the provided strings match the EBNF grammar of the