/* * Copyright (c) 2021, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include #include namespace Unicode { bool is_unicode_language_subtag(StringView subtag) { // unicode_language_subtag = alpha{2,3} | alpha{5,8} if ((subtag.length() < 2) || (subtag.length() == 4) || (subtag.length() > 8)) return false; return all_of(subtag, is_ascii_alpha); } bool is_unicode_script_subtag(StringView subtag) { // unicode_script_subtag = alpha{4} if (subtag.length() != 4) return false; return all_of(subtag, is_ascii_alpha); } bool is_unicode_region_subtag(StringView subtag) { // unicode_region_subtag = (alpha{2} | digit{3}) if (subtag.length() == 2) return all_of(subtag, is_ascii_alpha); if (subtag.length() == 3) return all_of(subtag, is_ascii_digit); return false; } bool is_unicode_variant_subtag(StringView subtag) { // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) if ((subtag.length() >= 5) && (subtag.length() <= 8)) return all_of(subtag, is_ascii_alphanumeric); if (subtag.length() == 4) return is_ascii_digit(subtag[0]) && all_of(subtag.substring_view(1), is_ascii_alphanumeric); return false; } Optional parse_unicode_language_id(StringView language) { // https://unicode.org/reports/tr35/#Unicode_language_identifier // // unicode_language_id = "root" // OR // unicode_language_id = ((unicode_language_subtag (sep unicode_script_subtag)?) | unicode_script_subtag) // (sep unicode_region_subtag)? // (sep unicode_variant_subtag)* LanguageID language_id {}; if (language == "root"sv) { language_id.is_root = true; return language_id; } auto segments = language.split_view_if(is_any_of("-_"sv), true); // keep_empty=true to ensure valid data follows a separator. size_t index = 0; if (segments.size() == index) return {}; if (is_unicode_language_subtag(segments[index])) { language_id.language = segments[index]; if (segments.size() == ++index) return language_id; } if (is_unicode_script_subtag(segments[index])) { language_id.script = segments[index]; if (segments.size() == ++index) return language_id; } else if (!language_id.language.has_value()) { return {}; } if (is_unicode_region_subtag(segments[index])) { language_id.region = segments[index]; if (segments.size() == ++index) return language_id; } while (index < segments.size()) { if (!is_unicode_variant_subtag(segments[index])) return {}; language_id.variants.append(segments[index++]); } return language_id; } Optional parse_unicode_locale_id(StringView locale) { LocaleID locale_id {}; // https://unicode.org/reports/tr35/#Unicode_locale_identifier // // unicode_locale_id = unicode_language_id // extensions* // pu_extensions? auto language_id = parse_unicode_language_id(locale); if (!language_id.has_value()) return {}; // FIXME: Handle extensions and pu_extensions. return LocaleID { language_id.release_value() }; } Optional canonicalize_unicode_locale_id(LocaleID& locale_id) { // https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers StringBuilder builder; if (!locale_id.language_id.language.has_value()) return {}; builder.append(locale_id.language_id.language->to_lowercase_string()); if (locale_id.language_id.script.has_value()) { builder.append('-'); builder.append(locale_id.language_id.script->to_titlecase_string()); } if (locale_id.language_id.region.has_value()) { builder.append('-'); builder.append(locale_id.language_id.region->to_uppercase_string()); } quick_sort(locale_id.language_id.variants); for (auto const& variant : locale_id.language_id.variants) { builder.append('-'); builder.append(variant.to_lowercase_string()); } // FIXME: Handle extensions and pu_extensions. return builder.build(); } }