summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTimothy Flynn <trflynn89@pm.me>2022-09-07 13:39:31 -0400
committerLinus Groh <mail@linusgroh.de>2022-09-08 23:12:31 +0100
commitb61eca0a1e401202e8315629c3d87158381c9518 (patch)
treedda2f0ef4e7938707652e2d24b72510ecdfbb467
parentfff79379d4270ece49cce85e67d09f0e5ab7315d (diff)
downloadserenity-b61eca0a1e401202e8315629c3d87158381c9518.zip
LibUnicode: Parse and generate emoji code point data
According to TR #51, the "best definition of the full set [of emojis] is in the emoji-test.txt file". This defines not only the emoji themselves, but the order in which they should be displayed, and what "group" of emojis they belong to.
-rw-r--r--Meta/CMake/unicode_data.cmake17
-rw-r--r--Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt1
-rw-r--r--Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateEmojiData.cpp220
-rw-r--r--Userland/Libraries/LibUnicode/CMakeLists.txt1
-rw-r--r--Userland/Libraries/LibUnicode/Emoji.cpp13
-rw-r--r--Userland/Libraries/LibUnicode/Emoji.h97
-rw-r--r--Userland/Libraries/LibUnicode/Forward.h2
7 files changed, 351 insertions, 0 deletions
diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake
index 88e9a9b64e..2751035068 100644
--- a/Meta/CMake/unicode_data.cmake
+++ b/Meta/CMake/unicode_data.cmake
@@ -92,12 +92,18 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
set(UNICODE_DATA_HEADER LibUnicode/UnicodeData.h)
set(UNICODE_DATA_IMPLEMENTATION LibUnicode/UnicodeData.cpp)
+ set(EMOJI_DATA_HEADER LibUnicode/EmojiData.h)
+ set(EMOJI_DATA_IMPLEMENTATION LibUnicode/EmojiData.cpp)
+
set(UNICODE_META_TARGET_PREFIX LibUnicode_)
if (CMAKE_CURRENT_BINARY_DIR MATCHES ".*/LibUnicode") # Serenity build.
set(UNICODE_DATA_HEADER UnicodeData.h)
set(UNICODE_DATA_IMPLEMENTATION UnicodeData.cpp)
+ set(EMOJI_DATA_HEADER EmojiData.h)
+ set(EMOJI_DATA_IMPLEMENTATION EmojiData.cpp)
+
set(UNICODE_META_TARGET_PREFIX "")
endif()
@@ -110,6 +116,15 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
"${UNICODE_DATA_IMPLEMENTATION}"
arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -k "${BLOCKS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
)
+ invoke_generator(
+ "EmojiData"
+ Lagom::GenerateEmojiData
+ "${UCD_VERSION_FILE}"
+ "${UNICODE_META_TARGET_PREFIX}"
+ "${EMOJI_DATA_HEADER}"
+ "${EMOJI_DATA_IMPLEMENTATION}"
+ arguments -e "${EMOJI_TEST_PATH}"
+ )
if (CMAKE_CURRENT_BINARY_DIR MATCHES ".*/LibUnicode") # Serenity build.
add_custom_command(
@@ -128,5 +143,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
set(UNICODE_DATA_SOURCES
${UNICODE_DATA_HEADER}
${UNICODE_DATA_IMPLEMENTATION}
+ ${EMOJI_DATA_HEADER}
+ ${EMOJI_DATA_IMPLEMENTATION}
)
endif()
diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt
index 281c97f413..b18637a184 100644
--- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt
+++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt
@@ -1 +1,2 @@
lagom_tool(GenerateUnicodeData SOURCES GenerateUnicodeData.cpp LIBS LibMain)
+lagom_tool(GenerateEmojiData SOURCES GenerateEmojiData.cpp LIBS LibMain)
diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateEmojiData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateEmojiData.cpp
new file mode 100644
index 0000000000..667d324c05
--- /dev/null
+++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateEmojiData.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include "GeneratorUtil.h"
+#include <AK/SourceGenerator.h>
+#include <AK/String.h>
+#include <AK/StringUtils.h>
+#include <AK/Types.h>
+#include <LibCore/ArgsParser.h>
+#include <LibCore/Stream.h>
+#include <LibUnicode/Emoji.h>
+
+using StringIndexType = u16;
+constexpr auto s_string_index_type = "u16"sv;
+
+// One entry parsed from emoji-test.txt; filled in by parse_emoji_test_data().
+struct Emoji {
+ StringIndexType name { 0 }; // Index of the title-cased name in EmojiData::unique_strings.
+ Unicode::EmojiGroup group; // Group selected by the most recent "# group: " header line.
+ u32 display_order { 0 }; // Position among the kept entries, counted in file order from 0.
+ String code_points_name; // The code point fields joined with '_'; names the generated array.
+ Vector<u32> code_points; // Code points parsed from the hexadecimal fields before the ';'.
+};
+
+// Everything collected from emoji-test.txt: the parser fills it, the generators read it.
+struct EmojiData {
+ UniqueStringStorage<StringIndexType> unique_strings; // Storage for emoji names; Emoji::name indexes into it.
+ Vector<Emoji> emojis; // Kept entries, in the order they were parsed.
+};
+
+// Parses emoji-test.txt from `file` and appends one Emoji per fully-qualified entry to `emoji_data`.
+//
+// Line handling:
+//   - empty lines are skipped;
+//   - "# group: <name>" comment lines select the group given to the entries that follow;
+//   - every other line starting with '#' is ignored;
+//   - data lines are split at the first ';' and the following '#':
+//       "<hex code points> ; <status> # <text whose name starts after its third space>"
+//
+// A malformed data line, or a kept data line that appears before any group header, trips a VERIFY.
+// Stream and allocation failures are propagated to the caller.
+static ErrorOr<void> parse_emoji_test_data(Core::Stream::BufferedFile& file, EmojiData& emoji_data)
+{
+    static constexpr auto group_header = "# group: "sv;
+
+    Array<u8, 1024> buffer;
+
+    // Empty until the first group header is seen, so an entry can never pick up an indeterminate group.
+    Optional<Unicode::EmojiGroup> group;
+    u32 display_order { 0 };
+
+    while (TRY(file.can_read_line())) {
+        auto line = TRY(file.read_line(buffer));
+        if (line.is_empty())
+            continue;
+
+        if (line.starts_with('#')) {
+            if (line.starts_with(group_header)) {
+                auto name = line.substring_view(group_header.length());
+                group = Unicode::emoji_group_from_string(name);
+            }
+
+            continue;
+        }
+
+        auto status_index = line.find(';');
+        VERIFY(status_index.has_value());
+
+        auto emoji_and_name_index = line.find('#', *status_index);
+        VERIFY(emoji_and_name_index.has_value());
+
+        // FIXME: Should we keep non-fully-qualified emoji? TR #51 states this is implementation defined.
+        auto status = line.substring_view(*status_index + 1, *emoji_and_name_index - *status_index - 1).trim_whitespace();
+        if (status != "fully-qualified"sv)
+            continue;
+
+        // Every kept entry must belong to a group announced by an earlier header line.
+        VERIFY(group.has_value());
+
+        Emoji emoji {};
+        emoji.group = *group;
+        emoji.display_order = display_order++;
+
+        auto code_points = line.substring_view(0, *status_index).split_view(' ');
+        TRY(emoji.code_points.try_ensure_capacity(code_points.size()));
+
+        for (auto code_point : code_points) {
+            auto value = AK::StringUtils::convert_to_uint_from_hex<u32>(code_point);
+            VERIFY(value.has_value());
+
+            // Capacity was reserved above, so the unchecked append cannot overflow.
+            emoji.code_points.unchecked_append(*value);
+        }
+
+        auto emoji_and_name = line.substring_view(*emoji_and_name_index + 1);
+
+        // The name is whatever follows the third space of the text after '#'.
+        auto emoji_and_name_spaces = emoji_and_name.find_all(" "sv);
+        VERIFY(emoji_and_name_spaces.size() > 2);
+
+        auto name = emoji_and_name.substring_view(emoji_and_name_spaces[2]).trim_whitespace();
+        emoji.name = emoji_data.unique_strings.ensure(name.to_titlecase_string());
+        emoji.code_points_name = String::join('_', code_points);
+
+        TRY(emoji_data.emojis.try_append(move(emoji)));
+    }
+
+    return {};
+}
+
+// Writes the generated header to `file`. Nothing is appended to the generator before the write,
+// so the emitted header is presumably empty for now; the parsed emoji data is not consulted.
+static ErrorOr<void> generate_emoji_data_header(Core::Stream::BufferedFile& file, EmojiData const&)
+{
+    StringBuilder builder;
+    SourceGenerator generator { builder };
+
+    auto contents = generator.as_string_view();
+    TRY(file.write(contents.bytes()));
+
+    return {};
+}
+
+// Writes the generated implementation file to `file`: the string table, one code point array per
+// emoji, the s_emojis table referring to those arrays, and find_emoji_for_code_points().
+// Write failures are propagated to the caller.
+static ErrorOr<void> generate_emoji_data_implementation(Core::Stream::BufferedFile& file, EmojiData const& emoji_data)
+{
+ StringBuilder builder;
+ SourceGenerator generator { builder };
+
+ // Values for the @string_index_type@ and @emojis_size@ placeholders used in the templates below.
+ generator.set("string_index_type"sv, s_string_index_type);
+ generator.set("emojis_size"sv, String::number(emoji_data.emojis.size()));
+
+ generator.append(R"~~~(
+#include <AK/Array.h>
+#include <AK/BinarySearch.h>
+#include <AK/Span.h>
+#include <AK/StringView.h>
+#include <AK/Types.h>
+#include <LibUnicode/Emoji.h>
+#include <LibUnicode/EmojiData.h>
+
+namespace Unicode {
+)~~~");
+
+ // Emits the string storage; presumably this also provides the decode_string() used by the template below.
+ emoji_data.unique_strings.generate(generator);
+
+ generator.append(R"~~~(
+struct EmojiData {
+ constexpr Emoji to_unicode_emoji() const
+ {
+ Emoji emoji {};
+ emoji.name = decode_string(name);
+ emoji.group = static_cast<EmojiGroup>(group);
+ emoji.display_order = display_order;
+ emoji.code_points = code_points;
+
+ return emoji;
+ }
+
+ @string_index_type@ name { 0 };
+ u8 group { 0 };
+ u32 display_order { 0 };
+ Span<u32 const> code_points;
+};
+)~~~");
+
+ // One constexpr array of code points per emoji, named s_<code_points_name>.
+ for (auto const& emoji : emoji_data.emojis) {
+ generator.set("name"sv, emoji.code_points_name);
+ generator.set("size"sv, String::number(emoji.code_points.size()));
+
+ generator.append(R"~~~(
+static constexpr Array<u32, @size@> s_@name@ { {)~~~");
+
+ // The first element is preceded by a space, every later one by ", ".
+ bool first = true;
+ for (auto code_point : emoji.code_points) {
+ generator.append(first ? " "sv : ", "sv);
+ generator.append(String::formatted("{:#x}", code_point));
+ first = false;
+ }
+
+ generator.append(" } };"sv);
+ }
+
+ generator.append(R"~~~(
+
+static constexpr Array<EmojiData, @emojis_size@> s_emojis { {)~~~");
+
+ // One table row per emoji; "name" is rebound here to the string index, and the row refers to
+ // its code point array by the s_<code_points_name> identifier emitted above.
+ for (auto const& emoji : emoji_data.emojis) {
+ generator.set("name"sv, String::number(emoji.name));
+ generator.set("group"sv, String::number(to_underlying(emoji.group)));
+ generator.set("display_order"sv, String::number(emoji.display_order));
+ generator.set("code_points_name"sv, emoji.code_points_name);
+
+ generator.append(R"~~~(
+ { @name@, @group@, @display_order@, s_@code_points_name@ },)~~~");
+ }
+
+ // Close the table and emit the lookup, which scans s_emojis linearly for a matching span.
+ generator.append(R"~~~(
+} };
+
+Optional<Emoji> find_emoji_for_code_points(Span<u32 const> code_points)
+{
+ for (auto& emoji : s_emojis) {
+ if (emoji.code_points == code_points)
+ return emoji.to_unicode_emoji();
+ }
+
+ return {};
+}
+
+}
+)~~~");
+
+ TRY(file.write(generator.as_string_view().bytes()));
+ return {};
+}
+
+// Entry point of the generator: parses emoji-test.txt and writes the generated emoji data
+// header and implementation files.
+//
+// Options:
+//   -h / --generated-header-path          file that receives the generated header
+//   -c / --generated-implementation-path  file that receives the generated implementation
+//   -e / --emoji-test-path                emoji-test.txt to read
+//
+// Returns 0 on success. Failures to open, read, or write any of the files are propagated.
+ErrorOr<int> serenity_main(Main::Arguments arguments)
+{
+    StringView generated_header_path;
+    StringView generated_implementation_path;
+    StringView emoji_test_path;
+
+    Core::ArgsParser args_parser;
+    // The help text names the Emoji Data files, since those are what this tool generates.
+    args_parser.add_option(generated_header_path, "Path to the Emoji Data header file to generate", "generated-header-path", 'h', "generated-header-path");
+    args_parser.add_option(generated_implementation_path, "Path to the Emoji Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
+    args_parser.add_option(emoji_test_path, "Path to emoji-test.txt file", "emoji-test-path", 'e', "emoji-test-path");
+    args_parser.parse(arguments);
+
+    auto generated_header_file = TRY(open_file(generated_header_path, Core::Stream::OpenMode::Write));
+    auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::Stream::OpenMode::Write));
+    auto emoji_test_file = TRY(open_file(emoji_test_path, Core::Stream::OpenMode::Read));
+
+    EmojiData emoji_data {};
+    TRY(parse_emoji_test_data(*emoji_test_file, emoji_data));
+
+    TRY(generate_emoji_data_header(*generated_header_file, emoji_data));
+    TRY(generate_emoji_data_implementation(*generated_implementation_file, emoji_data));
+
+    return 0;
+}
diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt
index 1bacfa8375..c6210ab217 100644
--- a/Userland/Libraries/LibUnicode/CMakeLists.txt
+++ b/Userland/Libraries/LibUnicode/CMakeLists.txt
@@ -3,6 +3,7 @@ include(${SerenityOS_SOURCE_DIR}/Meta/CMake/unicode_data.cmake)
set(SOURCES
CharacterTypes.cpp
CurrencyCode.cpp
+ Emoji.cpp
${UNICODE_DATA_SOURCES}
)
diff --git a/Userland/Libraries/LibUnicode/Emoji.cpp b/Userland/Libraries/LibUnicode/Emoji.cpp
new file mode 100644
index 0000000000..ce0974b9a8
--- /dev/null
+++ b/Userland/Libraries/LibUnicode/Emoji.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibUnicode/Emoji.h>
+
+namespace Unicode {
+
+// Weak fallback that reports no emoji for any input. The generated EmojiData.cpp defines the same
+// function, so this definition is presumably only used when that generated file is not linked in.
+Optional<Emoji> __attribute__((weak)) find_emoji_for_code_points(Span<u32 const>) { return {}; }
+
+}
diff --git a/Userland/Libraries/LibUnicode/Emoji.h b/Userland/Libraries/LibUnicode/Emoji.h
new file mode 100644
index 0000000000..ca5d81b839
--- /dev/null
+++ b/Userland/Libraries/LibUnicode/Emoji.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Optional.h>
+#include <AK/StringView.h>
+#include <AK/Types.h>
+
+namespace Unicode {
+
+// Emoji groups. Each enumerator has a matching group string in emoji_group_to_string() and
+// emoji_group_from_string(). The generator writes the underlying value into the generated table,
+// so reordering the enumerators changes the generated data.
+enum class EmojiGroup : u8 {
+ SmileysAndEmotion,
+ PeopleAndBody,
+ Component,
+ AnimalsAndNature,
+ FoodAndDrink,
+ TravelAndPlaces,
+ Activities,
+ Objects,
+ Symbols,
+ Flags,
+};
+
+// Description of a single emoji, as returned by find_emoji_for_code_points().
+struct Emoji {
+ StringView name; // Name of the emoji.
+ EmojiGroup group; // Group the emoji belongs to.
+ u32 display_order { 0 }; // Position of the emoji in the display ordering.
+ Span<u32 const> code_points; // View of the emoji's code points; the struct does not own them.
+};
+
+// Looks up the emoji made up of exactly the given code points; empty if there is none.
+Optional<Emoji> find_emoji_for_code_points(Span<u32 const> code_points);
+
+// Convenience overload for a fixed-size array of code points; forwards to the span overload.
+template<size_t Size>
+Optional<Emoji> find_emoji_for_code_points(u32 const (&code_points)[Size])
+{
+    Span<u32 const> span { code_points };
+    return find_emoji_for_code_points(span);
+}
+
+// Returns the group string for `group`; these are the same strings emoji_group_from_string() accepts.
+// A value that matches no enumerator reaches VERIFY_NOT_REACHED().
+constexpr StringView emoji_group_to_string(EmojiGroup group)
+{
+ switch (group) {
+ case EmojiGroup::SmileysAndEmotion:
+ return "Smileys & Emotion"sv;
+ case EmojiGroup::PeopleAndBody:
+ return "People & Body"sv;
+ case EmojiGroup::Component:
+ return "Component"sv;
+ case EmojiGroup::AnimalsAndNature:
+ return "Animals & Nature"sv;
+ case EmojiGroup::FoodAndDrink:
+ return "Food & Drink"sv;
+ case EmojiGroup::TravelAndPlaces:
+ return "Travel & Places"sv;
+ case EmojiGroup::Activities:
+ return "Activities"sv;
+ case EmojiGroup::Objects:
+ return "Objects"sv;
+ case EmojiGroup::Symbols:
+ return "Symbols"sv;
+ case EmojiGroup::Flags:
+ return "Flags"sv;
+ }
+
+ // Only reached when `group` holds a value outside the enumerators above.
+ VERIFY_NOT_REACHED();
+}
+
+// Maps a group string (the inverse of emoji_group_to_string()) to its EmojiGroup.
+// The comparison is exact; a string that matches no group reaches VERIFY_NOT_REACHED().
+constexpr EmojiGroup emoji_group_from_string(StringView group)
+{
+    struct GroupName {
+        StringView name;
+        EmojiGroup value;
+    };
+
+    constexpr GroupName known_groups[] = {
+        { "Smileys & Emotion"sv, EmojiGroup::SmileysAndEmotion },
+        { "People & Body"sv, EmojiGroup::PeopleAndBody },
+        { "Component"sv, EmojiGroup::Component },
+        { "Animals & Nature"sv, EmojiGroup::AnimalsAndNature },
+        { "Food & Drink"sv, EmojiGroup::FoodAndDrink },
+        { "Travel & Places"sv, EmojiGroup::TravelAndPlaces },
+        { "Activities"sv, EmojiGroup::Activities },
+        { "Objects"sv, EmojiGroup::Objects },
+        { "Symbols"sv, EmojiGroup::Symbols },
+        { "Flags"sv, EmojiGroup::Flags },
+    };
+
+    for (auto const& candidate : known_groups) {
+        if (group == candidate.name)
+            return candidate.value;
+    }
+
+    VERIFY_NOT_REACHED();
+}
+
+}
diff --git a/Userland/Libraries/LibUnicode/Forward.h b/Userland/Libraries/LibUnicode/Forward.h
index c3b08d3088..22cf7c698e 100644
--- a/Userland/Libraries/LibUnicode/Forward.h
+++ b/Userland/Libraries/LibUnicode/Forward.h
@@ -11,6 +11,7 @@
namespace Unicode {
enum class Block : u16;
+enum class EmojiGroup : u8;
enum class GeneralCategory : u8;
enum class GraphemeBreakProperty : u8;
enum class Property : u8;
@@ -19,6 +20,7 @@ enum class SentenceBreakProperty : u8;
enum class WordBreakProperty : u8;
struct CurrencyCode;
+struct Emoji;
struct SpecialCasing;
}