diff options
author | Timothy Flynn <trflynn89@pm.me> | 2023-02-23 08:33:22 -0500 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2023-02-24 19:48:47 +0100 |
commit | 1484d3d9f51e3ccade772931131e7d998933382e (patch) | |
tree | ecb6b6f9fae4c31e99bbbac36f23beca8b9d55ad /Userland | |
parent | 8c38d46c1ae4b2fddbef3e89802c43474126d9dd (diff) | |
download | serenity-1484d3d9f51e3ccade772931131e7d998933382e.zip |
LibUnicode: Add a method to check if a code point could start an emoji
Diffstat (limited to 'Userland')
-rw-r--r-- | Userland/Libraries/LibUnicode/Emoji.cpp | 104 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/Emoji.h | 6 |
2 files changed, 108 insertions, 2 deletions
diff --git a/Userland/Libraries/LibUnicode/Emoji.cpp b/Userland/Libraries/LibUnicode/Emoji.cpp index 4322342fdf..32e45d9351 100644 --- a/Userland/Libraries/LibUnicode/Emoji.cpp +++ b/Userland/Libraries/LibUnicode/Emoji.cpp @@ -1,13 +1,115 @@ /* - * Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org> + * Copyright (c) 2022-2023, Tim Flynn <trflynn89@serenityos.org> * * SPDX-License-Identifier: BSD-2-Clause */ +#include <AK/CharacterTypes.h> +#include <AK/Utf32View.h> +#include <AK/Utf8View.h> +#include <LibUnicode/CharacterTypes.h> #include <LibUnicode/Emoji.h> +#if ENABLE_UNICODE_DATA +# include <LibUnicode/UnicodeData.h> +#endif + namespace Unicode { Optional<Emoji> __attribute__((weak)) find_emoji_for_code_points(ReadonlySpan<u32>) { return {}; } +#if ENABLE_UNICODE_DATA + +// https://unicode.org/reports/tr51/#def_emoji_core_sequence +static bool could_be_start_of_emoji_core_sequence(u32 code_point, Optional<u32> const& next_code_point) +{ + // emoji_core_sequence := emoji_character | emoji_presentation_sequence | emoji_keycap_sequence | emoji_modifier_sequence | emoji_flag_sequence + + static constexpr auto emoji_presentation_selector = 0xFE0Fu; + static constexpr auto combining_enclosing_keycap = 0x20E3u; + + // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence + // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3} + if (is_ascii_digit(code_point) || code_point == '#' || code_point == '*') + return next_code_point == emoji_presentation_selector || next_code_point == combining_enclosing_keycap; + + // A little non-standard, but all other ASCII code points are not the beginning of any emoji sequence. 
+ if (is_ascii(code_point)) + return false; + + // https://unicode.org/reports/tr51/#def_emoji_character + if (code_point_has_property(code_point, Property::Emoji)) + return true; + + // https://unicode.org/reports/tr51/#def_emoji_presentation_sequence + // emoji_presentation_sequence := emoji_character emoji_presentation_selector + if (next_code_point == emoji_presentation_selector) + return true; + + // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence + // emoji_modifier_sequence := emoji_modifier_base emoji_modifier + if (code_point_has_property(code_point, Property::Emoji_Modifier_Base)) + return true; + + // https://unicode.org/reports/tr51/#def_emoji_flag_sequence + // emoji_flag_sequence := regional_indicator regional_indicator + if (code_point_has_property(code_point, Property::Regional_Indicator)) + return true; + + return false; +} + +static bool could_be_start_of_serenity_emoji(u32 code_point) +{ + // We use Supplementary Private Use Area-B for custom Serenity emoji, starting at U+10CD00. + static constexpr auto first_custom_serenity_emoji_code_point = 0x10CD00u; + + return code_point >= first_custom_serenity_emoji_code_point; +} + +#endif + +// https://unicode.org/reports/tr51/#def_emoji_sequence +template<typename CodePointIterator> +static bool could_be_start_of_emoji_sequence_impl(CodePointIterator const& it) +{ + // emoji_sequence := emoji_core_sequence | emoji_zwj_sequence | emoji_tag_sequence + + if (it.done()) + return false; + +#if ENABLE_UNICODE_DATA + // The purpose of this method is to quickly filter out code points that cannot be the start of + // an emoji. 
The emoji_core_sequence definition alone captures the start of all possible + // emoji_zwj_sequence and emoji_tag_sequence emojis, because: + // + // * emoji_zwj_sequence must begin with emoji_zwj_element, which is: + // emoji_zwj_element := emoji_core_sequence | emoji_tag_sequence + // + // * emoji_tag_sequence must begin with tag_base, which is: + // tag_base := emoji_character | emoji_modifier_sequence | emoji_presentation_sequence + // Note that this is a subset of emoji_core_sequence. + auto code_point = *it; + auto next_code_point = it.peek(1); + + if (could_be_start_of_emoji_core_sequence(code_point, next_code_point)) + return true; + if (could_be_start_of_serenity_emoji(code_point)) + return true; + return false; +#else + return true; +#endif +} + +bool could_be_start_of_emoji_sequence(Utf8CodePointIterator const& it) +{ + return could_be_start_of_emoji_sequence_impl(it); +} + +bool could_be_start_of_emoji_sequence(Utf32CodePointIterator const& it) +{ + return could_be_start_of_emoji_sequence_impl(it); +} + } diff --git a/Userland/Libraries/LibUnicode/Emoji.h b/Userland/Libraries/LibUnicode/Emoji.h index dcb137ba0a..7e772e9c3e 100644 --- a/Userland/Libraries/LibUnicode/Emoji.h +++ b/Userland/Libraries/LibUnicode/Emoji.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org> + * Copyright (c) 2022-2023, Tim Flynn <trflynn89@serenityos.org> * * SPDX-License-Identifier: BSD-2-Clause */ #pragma once +#include <AK/Forward.h> #include <AK/Optional.h> #include <AK/StringView.h> #include <AK/Types.h> @@ -46,6 +47,9 @@ Optional<Emoji> find_emoji_for_code_points(u32 const (&code_points)[Size]) return find_emoji_for_code_points(ReadonlySpan<u32> { code_points }); } +bool could_be_start_of_emoji_sequence(Utf8CodePointIterator const&); +bool could_be_start_of_emoji_sequence(Utf32CodePointIterator const&); + constexpr StringView emoji_group_to_string(EmojiGroup group) { switch (group) { |