diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-07-28 21:45:09 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2021-07-30 21:26:31 +0100 |
commit | f1809db9946200bcbe86df8c3d6ebf87b44ec0a1 (patch) | |
tree | bde429aaa4f69b5836f688e0bef032ec72e5d9de /Userland/Libraries/LibUnicode | |
parent | 3f80791ed535e0752109cb8900b56e7244e01669 (diff) | |
download | serenity-f1809db9946200bcbe86df8c3d6ebf87b44ec0a1.zip |
LibUnicode: Add public methods to compare and look up Unicode properties
Adds methods to retrieve a Unicode property from a string and to check
if a code point matches a Unicode property.
Also adds a <LibUnicode/Forward.h> header.
Diffstat (limited to 'Userland/Libraries/LibUnicode')
4 files changed, 92 insertions, 9 deletions
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 0cdf901721..411da0da9f 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -4,6 +4,7 @@ * SPDX-License-Identifier: BSD-2-Clause */ +#include <AK/CharacterTypes.h> #include <AK/Platform.h> #include <AK/StringBuilder.h> #include <AK/Types.h> @@ -12,8 +13,6 @@ #if ENABLE_UNICODE_DATA # include <LibUnicode/UnicodeData.h> -#else -# include <AK/CharacterTypes.h> #endif // For details on the algorithms used here, see Section 3.13 Default Case Algorithms @@ -41,7 +40,7 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt size_t cased_letter_count = 0; for (auto code_point : preceding_view) { - auto unicode_data = unicode_data_for_code_point(code_point); + auto unicode_data = Detail::unicode_data_for_code_point(code_point); if (!unicode_data.has_value()) return false; @@ -58,7 +57,7 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt return false; for (auto code_point : following_view) { - auto unicode_data = unicode_data_for_code_point(code_point); + auto unicode_data = Detail::unicode_data_for_code_point(code_point); if (!unicode_data.has_value()) return false; @@ -107,7 +106,7 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, s u32 to_unicode_lowercase(u32 code_point) { #if ENABLE_UNICODE_DATA - auto unicode_data = unicode_data_for_code_point(code_point); + auto unicode_data = Detail::unicode_data_for_code_point(code_point); if (unicode_data.has_value()) return unicode_data->simple_lowercase_mapping; return code_point; @@ -119,7 +118,7 @@ u32 to_unicode_lowercase(u32 code_point) u32 to_unicode_uppercase(u32 code_point) { #if ENABLE_UNICODE_DATA - auto unicode_data = unicode_data_for_code_point(code_point); + auto unicode_data = Detail::unicode_data_for_code_point(code_point); if 
(unicode_data.has_value()) return unicode_data->simple_uppercase_mapping; return code_point; @@ -139,7 +138,7 @@ String to_unicode_lowercase_full(StringView const& string) u32 code_point = *it; size_t byte_length = it.underlying_code_point_length_in_bytes(); - auto unicode_data = unicode_data_for_code_point(code_point); + auto unicode_data = Detail::unicode_data_for_code_point(code_point); if (!unicode_data.has_value()) { builder.append_code_point(code_point); index += byte_length; @@ -174,7 +173,7 @@ String to_unicode_uppercase_full(StringView const& string) u32 code_point = *it; size_t byte_length = it.underlying_code_point_length_in_bytes(); - auto unicode_data = unicode_data_for_code_point(code_point); + auto unicode_data = Detail::unicode_data_for_code_point(code_point); if (!unicode_data.has_value()) { builder.append_code_point(code_point); index += byte_length; @@ -198,4 +197,29 @@ String to_unicode_uppercase_full(StringView const& string) #endif } +Optional<Property> property_from_string([[maybe_unused]] StringView const& property) +{ +#if ENABLE_UNICODE_DATA + return Detail::property_from_string(property); +#else + return {}; +#endif +} + +bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property) +{ +#if ENABLE_UNICODE_DATA + if (property == Property::Any) + return is_unicode(code_point); + + auto unicode_data = Detail::unicode_data_for_code_point(code_point); + if (!unicode_data.has_value()) + return false; + + return has_property(*unicode_data, property); +#else + return false; +#endif +} + } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index beb2288cfb..eac6e79293 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -9,6 +9,7 @@ #include <AK/Forward.h> #include <AK/String.h> #include <AK/Types.h> +#include <LibUnicode/Forward.h> namespace Unicode { @@ -20,4 +21,7 @@ u32 
to_unicode_uppercase(u32 code_point); String to_unicode_lowercase_full(StringView const&); String to_unicode_uppercase_full(StringView const&); +Optional<Property> property_from_string(StringView const&); +bool code_point_has_property(u32 code_point, Property property); + } diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp index 9744fc92e5..7132d86dd6 100644 --- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp +++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp @@ -404,6 +404,7 @@ constexpr @name@ operator|(@name@ value1, @name@ value2) #include <AK/Optional.h> #include <AK/Types.h> +#include <LibUnicode/Forward.h> namespace Unicode { )~~~"); @@ -411,7 +412,7 @@ namespace Unicode { generate_enum("Locale"sv, "None"sv, move(unicode_data.locales)); generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions)); generate_enum("GeneralCategory"sv, {}, move(unicode_data.general_categories)); - generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), move(unicode_data.prop_aliases), true); + generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), unicode_data.prop_aliases, true); generate_enum("WordBreakProperty"sv, "Other"sv, unicode_data.word_break_prop_list.keys()); generator.append(R"~~~( @@ -469,7 +470,12 @@ struct UnicodeData { WordBreakProperty word_break_property { WordBreakProperty::Other }; }; +namespace Detail { + Optional<UnicodeData> unicode_data_for_code_point(u32 code_point); +Optional<Property> property_from_string(StringView const& property); + +} })~~~"); @@ -489,6 +495,7 @@ static void generate_unicode_data_implementation(UnicodeData unicode_data) #include <AK/Array.h> #include <AK/CharacterTypes.h> #include <AK/Find.h> +#include <AK/StringView.h> #include <LibUnicode/UnicodeData.h> namespace Unicode { @@ -597,6 +604,8 @@ static Optional<u32> 
index_of_code_point_in_range(u32 code_point) return {}; } +namespace Detail { + Optional<UnicodeData> unicode_data_for_code_point(u32 code_point) { VERIFY(is_unicode(code_point)); @@ -618,6 +627,30 @@ Optional<UnicodeData> unicode_data_for_code_point(u32 code_point) return {}; } +Optional<Property> property_from_string(StringView const& property) +{ + if (property == "Assigned"sv) + return Property::Assigned;)~~~"); + + for (auto const& property : unicode_data.prop_list) { + generator.set("property", property.key); + generator.append(R"~~~( + if (property == "@property@"sv) + return Property::@property@;)~~~"); + } + for (auto const& alias : unicode_data.prop_aliases) { + generator.set("property", alias.alias); + generator.append(R"~~~( + if (property == "@property@"sv) + return Property::@property@;)~~~"); + } + + generator.append(R"~~~( + return {}; +} + +} + })~~~"); outln("{}", generator.as_string_view()); diff --git a/Userland/Libraries/LibUnicode/Forward.h b/Userland/Libraries/LibUnicode/Forward.h new file mode 100644 index 0000000000..0c6aa4cb08 --- /dev/null +++ b/Userland/Libraries/LibUnicode/Forward.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include <AK/Types.h> + +namespace Unicode { + +enum class Condition; +enum class GeneralCategory; +enum class Locale; +enum class Property : u64; +enum class WordBreakProperty; + +struct SpecialCasing; +struct UnicodeData; + +} |