LibUnicode: Add a method to check if a code point could start an emoji

author: Timothy Flynn <trflynn89@pm.me> 2023-02-23 08:33:22 -0500
committer: Linus Groh <mail@linusgroh.de> 2023-02-24 19:48:47 +0100
commit: 1484d3d9f51e3ccade772931131e7d998933382e (patch)
tree: ecb6b6f9fae4c31e99bbbac36f23beca8b9d55ad /Userland
parent: 8c38d46c1ae4b2fddbef3e89802c43474126d9dd (diff)
download: serenity-1484d3d9f51e3ccade772931131e7d998933382e.zip
2 files changed, 108 insertions, 2 deletions
diff --git a/Userland/Libraries/LibUnicode/Emoji.cpp b/Userland/Libraries/LibUnicode/Emoji.cpp
index 4322342fdf..32e45d9351 100644
--- a/Userland/Libraries/LibUnicode/Emoji.cpp
+++ b/Userland/Libraries/LibUnicode/Emoji.cpp
@@ -1,13 +1,115 @@
 /*
- * Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org>
+ * Copyright (c) 2022-2023, Tim Flynn <trflynn89@serenityos.org>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#include <AK/CharacterTypes.h>
+#include <AK/Utf32View.h>
+#include <AK/Utf8View.h>
+#include <LibUnicode/CharacterTypes.h>
 #include <LibUnicode/Emoji.h>
 
+#if ENABLE_UNICODE_DATA
+#    include <LibUnicode/UnicodeData.h>
+#endif
+
 namespace Unicode {
 
 Optional<Emoji> __attribute__((weak)) find_emoji_for_code_points(ReadonlySpan<u32>) { return {}; }
 
+#if ENABLE_UNICODE_DATA
+
+// Returns whether code_point (looking at most one code point ahead) may begin an emoji_core_sequence: https://unicode.org/reports/tr51/#def_emoji_core_sequence
+static bool could_be_start_of_emoji_core_sequence(u32 code_point, Optional<u32> const& next_code_point)
+{
+    // emoji_core_sequence := emoji_character | emoji_presentation_sequence | emoji_keycap_sequence | emoji_modifier_sequence | emoji_flag_sequence

+    static constexpr auto emoji_presentation_selector = 0xFE0Fu;
+    static constexpr auto combining_enclosing_keycap = 0x20E3u;

+    // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
+    // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3} (only the immediately following code point is inspected; either U+FE0F or U+20E3 there is accepted)
+    if (is_ascii_digit(code_point) || code_point == '#' || code_point == '*')
+        return next_code_point == emoji_presentation_selector || next_code_point == combining_enclosing_keycap;

+    // A little non-standard, but all other ASCII code points are not the beginning of any emoji sequence.
+    if (is_ascii(code_point))
+        return false;

+    // https://unicode.org/reports/tr51/#def_emoji_character
+    if (code_point_has_property(code_point, Property::Emoji))
+        return true;

+    // https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
+    // emoji_presentation_sequence := emoji_character emoji_presentation_selector (NOTE(review): code points with the Emoji property already returned above, so this accepts any other non-ASCII code point followed by U+FE0F; presumably an intentional over-approximation for a pre-filter - confirm)
+    if (next_code_point == emoji_presentation_selector)
+        return true;

+    // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
+    // emoji_modifier_sequence := emoji_modifier_base emoji_modifier (the trailing emoji_modifier is not checked here)
+    if (code_point_has_property(code_point, Property::Emoji_Modifier_Base))
+        return true;

+    // https://unicode.org/reports/tr51/#def_emoji_flag_sequence
+    // emoji_flag_sequence := regional_indicator regional_indicator (the second regional_indicator is not checked here)
+    if (code_point_has_property(code_point, Property::Regional_Indicator))
+        return true;

+    return false;
+}
+
+static bool could_be_start_of_serenity_emoji(u32 code_point)
+{
+    // We use Supplementary Private Use Area-B for custom Serenity emoji, starting at U+10CD00. Only this lower bound is checked, so every code point at or above it is reported as a possible start.
+    static constexpr auto first_custom_serenity_emoji_code_point = 0x10CD00u;

+    return code_point >= first_custom_serenity_emoji_code_point;
+}
+
+#endif
+
+// Shared implementation for both iterator overloads; returns false for an exhausted iterator. See https://unicode.org/reports/tr51/#def_emoji_sequence
+template<typename CodePointIterator>
+static bool could_be_start_of_emoji_sequence_impl(CodePointIterator const& it)
+{
+    // emoji_sequence := emoji_core_sequence | emoji_zwj_sequence | emoji_tag_sequence

+    if (it.done())
+        return false;

+#if ENABLE_UNICODE_DATA
+    // The purpose of this method is to quickly filter out code points that cannot be the start of
+    // an emoji. The emoji_core_sequence definition alone captures the start of all possible
+    // emoji_zwj_sequence and emoji_tag_sequence emojis, because:
+    //
+    //     * emoji_zwj_sequence must begin with emoji_zwj_element, which is:
+    //       emoji_zwj_element := emoji_character | emoji_presentation_sequence | emoji_modifier_sequence (a subset of emoji_core_sequence)
+    //
+    //     * emoji_tag_sequence must begin with tag_base, which is:
+    //       tag_base := emoji_character | emoji_modifier_sequence | emoji_presentation_sequence
+    //       Note that this is a subset of emoji_core_sequence.
+    auto code_point = *it;
+    auto next_code_point = it.peek(1); // presumably the code point following *it, empty when there is none - confirm against the iterator API
+
+    if (could_be_start_of_emoji_core_sequence(code_point, next_code_point))
+        return true;
+    if (could_be_start_of_serenity_emoji(code_point))
+        return true;
+    return false;
+#else
+    return true; // Built without ENABLE_UNICODE_DATA: no filtering is done, so every non-exhausted iterator yields true.
+#endif
+}
+
+bool could_be_start_of_emoji_sequence(Utf8CodePointIterator const& it) // UTF-8 overload; forwards to the shared template implementation.
+{
+    return could_be_start_of_emoji_sequence_impl(it);
+}
+
+bool could_be_start_of_emoji_sequence(Utf32CodePointIterator const& it) // UTF-32 overload; forwards to the shared template implementation.
+{
+    return could_be_start_of_emoji_sequence_impl(it);
+}
+
 }
diff --git a/Userland/Libraries/LibUnicode/Emoji.h b/Userland/Libraries/LibUnicode/Emoji.h
index dcb137ba0a..7e772e9c3e 100644
--- a/Userland/Libraries/LibUnicode/Emoji.h
+++ b/Userland/Libraries/LibUnicode/Emoji.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org>
+ * Copyright (c) 2022-2023, Tim Flynn <trflynn89@serenityos.org>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
 #pragma once
 
+#include <AK/Forward.h>
 #include <AK/Optional.h>
 #include <AK/StringView.h>
 #include <AK/Types.h>
@@ -46,6 +47,9 @@ Optional<Emoji> find_emoji_for_code_points(u32 const (&code_points)[Size])
     return find_emoji_for_code_points(ReadonlySpan<u32> { code_points });
 }
 
+bool could_be_start_of_emoji_sequence(Utf8CodePointIterator const&);
+bool could_be_start_of_emoji_sequence(Utf32CodePointIterator const&);
+
 constexpr StringView emoji_group_to_string(EmojiGroup group)
 {
     switch (group) {
author	Timothy Flynn <trflynn89@pm.me>	2023-02-23 08:33:22 -0500
committer	Linus Groh <mail@linusgroh.de>	2023-02-24 19:48:47 +0100
commit	1484d3d9f51e3ccade772931131e7d998933382e (patch)
tree	ecb6b6f9fae4c31e99bbbac36f23beca8b9d55ad /Userland
parent	8c38d46c1ae4b2fddbef3e89802c43474126d9dd (diff)
download	serenity-1484d3d9f51e3ccade772931131e7d998933382e.zip