diff options
author | Timothy Flynn <trflynn89@pm.me> | 2022-09-07 13:39:31 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2022-09-08 23:12:31 +0100 |
commit | b61eca0a1e401202e8315629c3d87158381c9518 (patch) | |
tree | dda2f0ef4e7938707652e2d24b72510ecdfbb467 | |
parent | fff79379d4270ece49cce85e67d09f0e5ab7315d (diff) | |
download | serenity-b61eca0a1e401202e8315629c3d87158381c9518.zip |
LibUnicode: Parse and generate emoji code point data
According to TR #51, the "best definition of the full set [of emojis] is
in the emoji-test.txt file". This defines not only the emoji themselves,
but the order in which they should be displayed, and what "group" of
emojis they belong to.
-rw-r--r-- | Meta/CMake/unicode_data.cmake | 17 | ||||
-rw-r--r-- | Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt | 1 | ||||
-rw-r--r-- | Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateEmojiData.cpp | 220 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/CMakeLists.txt | 1 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/Emoji.cpp | 13 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/Emoji.h | 97 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/Forward.h | 2 |
7 files changed, 351 insertions, 0 deletions
diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index 88e9a9b64e..2751035068 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -92,12 +92,18 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) set(UNICODE_DATA_HEADER LibUnicode/UnicodeData.h) set(UNICODE_DATA_IMPLEMENTATION LibUnicode/UnicodeData.cpp) + set(EMOJI_DATA_HEADER LibUnicode/EmojiData.h) + set(EMOJI_DATA_IMPLEMENTATION LibUnicode/EmojiData.cpp) + set(UNICODE_META_TARGET_PREFIX LibUnicode_) if (CMAKE_CURRENT_BINARY_DIR MATCHES ".*/LibUnicode") # Serenity build. set(UNICODE_DATA_HEADER UnicodeData.h) set(UNICODE_DATA_IMPLEMENTATION UnicodeData.cpp) + set(EMOJI_DATA_HEADER EmojiData.h) + set(EMOJI_DATA_IMPLEMENTATION EmojiData.cpp) + set(UNICODE_META_TARGET_PREFIX "") endif() @@ -110,6 +116,15 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) "${UNICODE_DATA_IMPLEMENTATION}" arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -k "${BLOCKS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}" ) + invoke_generator( + "EmojiData" + Lagom::GenerateEmojiData + "${UCD_VERSION_FILE}" + "${UNICODE_META_TARGET_PREFIX}" + "${EMOJI_DATA_HEADER}" + "${EMOJI_DATA_IMPLEMENTATION}" + arguments -e "${EMOJI_TEST_PATH}" + ) if (CMAKE_CURRENT_BINARY_DIR MATCHES ".*/LibUnicode") # Serenity build. 
add_custom_command( @@ -128,5 +143,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION} + ${EMOJI_DATA_HEADER} + ${EMOJI_DATA_IMPLEMENTATION} ) endif() diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt index 281c97f413..b18637a184 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt @@ -1 +1,2 @@ lagom_tool(GenerateUnicodeData SOURCES GenerateUnicodeData.cpp LIBS LibMain) +lagom_tool(GenerateEmojiData SOURCES GenerateEmojiData.cpp LIBS LibMain) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateEmojiData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateEmojiData.cpp new file mode 100644 index 0000000000..667d324c05 --- /dev/null +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateEmojiData.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include "GeneratorUtil.h" +#include <AK/SourceGenerator.h> +#include <AK/String.h> +#include <AK/StringUtils.h> +#include <AK/Types.h> +#include <LibCore/ArgsParser.h> +#include <LibCore/Stream.h> +#include <LibUnicode/Emoji.h> + +using StringIndexType = u16; +constexpr auto s_string_index_type = "u16"sv; + +struct Emoji { + StringIndexType name { 0 }; + Unicode::EmojiGroup group; + u32 display_order { 0 }; + String code_points_name; + Vector<u32> code_points; +}; + +struct EmojiData { + UniqueStringStorage<StringIndexType> unique_strings; + Vector<Emoji> emojis; +}; + +static ErrorOr<void> parse_emoji_test_data(Core::Stream::BufferedFile& file, EmojiData& emoji_data) +{ + static constexpr auto group_header = "# group: "sv; + + Array<u8, 1024> buffer; + + Unicode::EmojiGroup group; + u32 display_order { 0 }; + + while (TRY(file.can_read_line())) { + auto line = 
TRY(file.read_line(buffer)); + if (line.is_empty()) + continue; + + if (line.starts_with('#')) { + if (line.starts_with(group_header)) { + auto name = line.substring_view(group_header.length()); + group = Unicode::emoji_group_from_string(name); + } + + continue; + } + + auto status_index = line.find(';'); + VERIFY(status_index.has_value()); + + auto emoji_and_name_index = line.find('#', *status_index); + VERIFY(emoji_and_name_index.has_value()); + + // FIXME: Should we keep non-fully-qualified emoji? TR #51 states this is implementation defined. + auto status = line.substring_view(*status_index + 1, *emoji_and_name_index - *status_index - 1).trim_whitespace(); + if (status != "fully-qualified"sv) + continue; + + Emoji emoji {}; + emoji.group = group; + emoji.display_order = display_order++; + + auto code_points = line.substring_view(0, *status_index).split_view(' '); + TRY(emoji.code_points.try_ensure_capacity(code_points.size())); + + for (auto code_point : code_points) { + auto value = AK::StringUtils::convert_to_uint_from_hex<u32>(code_point); + VERIFY(value.has_value()); + + emoji.code_points.unchecked_append(*value); + } + + auto emoji_and_name = line.substring_view(*emoji_and_name_index + 1); + + auto emoji_and_name_spaces = emoji_and_name.find_all(" "sv); + VERIFY(emoji_and_name_spaces.size() > 2); + + auto name = emoji_and_name.substring_view(emoji_and_name_spaces[2]).trim_whitespace(); + emoji.name = emoji_data.unique_strings.ensure(name.to_titlecase_string()); + emoji.code_points_name = String::join('_', code_points); + + TRY(emoji_data.emojis.try_append(move(emoji))); + } + + return {}; +} + +static ErrorOr<void> generate_emoji_data_header(Core::Stream::BufferedFile& file, EmojiData const&) +{ + StringBuilder builder; + SourceGenerator generator { builder }; + + TRY(file.write(generator.as_string_view().bytes())); + return {}; +} + +static ErrorOr<void> generate_emoji_data_implementation(Core::Stream::BufferedFile& file, EmojiData const& emoji_data) +{ + 
StringBuilder builder; + SourceGenerator generator { builder }; + + generator.set("string_index_type"sv, s_string_index_type); + generator.set("emojis_size"sv, String::number(emoji_data.emojis.size())); + + generator.append(R"~~~( +#include <AK/Array.h> +#include <AK/BinarySearch.h> +#include <AK/Span.h> +#include <AK/StringView.h> +#include <AK/Types.h> +#include <LibUnicode/Emoji.h> +#include <LibUnicode/EmojiData.h> + +namespace Unicode { +)~~~"); + + emoji_data.unique_strings.generate(generator); + + generator.append(R"~~~( +struct EmojiData { + constexpr Emoji to_unicode_emoji() const + { + Emoji emoji {}; + emoji.name = decode_string(name); + emoji.group = static_cast<EmojiGroup>(group); + emoji.display_order = display_order; + emoji.code_points = code_points; + + return emoji; + } + + @string_index_type@ name { 0 }; + u8 group { 0 }; + u32 display_order { 0 }; + Span<u32 const> code_points; +}; +)~~~"); + + for (auto const& emoji : emoji_data.emojis) { + generator.set("name"sv, emoji.code_points_name); + generator.set("size"sv, String::number(emoji.code_points.size())); + + generator.append(R"~~~( +static constexpr Array<u32, @size@> s_@name@ { {)~~~"); + + bool first = true; + for (auto code_point : emoji.code_points) { + generator.append(first ? 
" "sv : ", "sv); + generator.append(String::formatted("{:#x}", code_point)); + first = false; + } + + generator.append(" } };"sv); + } + + generator.append(R"~~~( + +static constexpr Array<EmojiData, @emojis_size@> s_emojis { {)~~~"); + + for (auto const& emoji : emoji_data.emojis) { + generator.set("name"sv, String::number(emoji.name)); + generator.set("group"sv, String::number(to_underlying(emoji.group))); + generator.set("display_order"sv, String::number(emoji.display_order)); + generator.set("code_points_name"sv, emoji.code_points_name); + + generator.append(R"~~~( + { @name@, @group@, @display_order@, s_@code_points_name@ },)~~~"); + } + + generator.append(R"~~~( +} }; + +Optional<Emoji> find_emoji_for_code_points(Span<u32 const> code_points) +{ + for (auto& emoji : s_emojis) { + if (emoji.code_points == code_points) + return emoji.to_unicode_emoji(); + } + + return {}; +} + +} +)~~~"); + + TRY(file.write(generator.as_string_view().bytes())); + return {}; +} + +ErrorOr<int> serenity_main(Main::Arguments arguments) +{ + StringView generated_header_path; + StringView generated_implementation_path; + StringView emoji_test_path; + + Core::ArgsParser args_parser; + args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path"); + args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); + args_parser.add_option(emoji_test_path, "Path to emoji-test.txt file", "emoji-test-path", 'e', "emoji-test-path"); + args_parser.parse(arguments); + + auto generated_header_file = TRY(open_file(generated_header_path, Core::Stream::OpenMode::Write)); + auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::Stream::OpenMode::Write)); + auto emoji_test_file = TRY(open_file(emoji_test_path, Core::Stream::OpenMode::Read)); + + EmojiData 
emoji_data {}; + TRY(parse_emoji_test_data(*emoji_test_file, emoji_data)); + + TRY(generate_emoji_data_header(*generated_header_file, emoji_data)); + TRY(generate_emoji_data_implementation(*generated_implementation_file, emoji_data)); + + return 0; +} diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt index 1bacfa8375..c6210ab217 100644 --- a/Userland/Libraries/LibUnicode/CMakeLists.txt +++ b/Userland/Libraries/LibUnicode/CMakeLists.txt @@ -3,6 +3,7 @@ include(${SerenityOS_SOURCE_DIR}/Meta/CMake/unicode_data.cmake) set(SOURCES CharacterTypes.cpp CurrencyCode.cpp + Emoji.cpp ${UNICODE_DATA_SOURCES} ) diff --git a/Userland/Libraries/LibUnicode/Emoji.cpp b/Userland/Libraries/LibUnicode/Emoji.cpp new file mode 100644 index 0000000000..ce0974b9a8 --- /dev/null +++ b/Userland/Libraries/LibUnicode/Emoji.cpp @@ -0,0 +1,13 @@ +/* + * Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include <LibUnicode/Emoji.h> + +namespace Unicode { + +Optional<Emoji> __attribute__((weak)) find_emoji_for_code_points(Span<u32 const>) { return {}; } + +} diff --git a/Userland/Libraries/LibUnicode/Emoji.h b/Userland/Libraries/LibUnicode/Emoji.h new file mode 100644 index 0000000000..ca5d81b839 --- /dev/null +++ b/Userland/Libraries/LibUnicode/Emoji.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include <AK/Optional.h> +#include <AK/StringView.h> +#include <AK/Types.h> + +namespace Unicode { + +enum class EmojiGroup : u8 { + SmileysAndEmotion, + PeopleAndBody, + Component, + AnimalsAndNature, + FoodAndDrink, + TravelAndPlaces, + Activities, + Objects, + Symbols, + Flags, +}; + +struct Emoji { + StringView name; + EmojiGroup group; + u32 display_order { 0 }; + Span<u32 const> code_points; +}; + +Optional<Emoji> find_emoji_for_code_points(Span<u32 const> code_points); + 
+template<size_t Size> +Optional<Emoji> find_emoji_for_code_points(u32 const (&code_points)[Size]) +{ + return find_emoji_for_code_points(Span<u32 const> { code_points }); +} + +constexpr StringView emoji_group_to_string(EmojiGroup group) +{ + switch (group) { + case EmojiGroup::SmileysAndEmotion: + return "Smileys & Emotion"sv; + case EmojiGroup::PeopleAndBody: + return "People & Body"sv; + case EmojiGroup::Component: + return "Component"sv; + case EmojiGroup::AnimalsAndNature: + return "Animals & Nature"sv; + case EmojiGroup::FoodAndDrink: + return "Food & Drink"sv; + case EmojiGroup::TravelAndPlaces: + return "Travel & Places"sv; + case EmojiGroup::Activities: + return "Activities"sv; + case EmojiGroup::Objects: + return "Objects"sv; + case EmojiGroup::Symbols: + return "Symbols"sv; + case EmojiGroup::Flags: + return "Flags"sv; + } + + VERIFY_NOT_REACHED(); +} + +constexpr EmojiGroup emoji_group_from_string(StringView group) +{ + if (group == "Smileys & Emotion"sv) + return EmojiGroup::SmileysAndEmotion; + if (group == "People & Body"sv) + return EmojiGroup::PeopleAndBody; + if (group == "Component"sv) + return EmojiGroup::Component; + if (group == "Animals & Nature"sv) + return EmojiGroup::AnimalsAndNature; + if (group == "Food & Drink"sv) + return EmojiGroup::FoodAndDrink; + if (group == "Travel & Places"sv) + return EmojiGroup::TravelAndPlaces; + if (group == "Activities"sv) + return EmojiGroup::Activities; + if (group == "Objects"sv) + return EmojiGroup::Objects; + if (group == "Symbols"sv) + return EmojiGroup::Symbols; + if (group == "Flags"sv) + return EmojiGroup::Flags; + + VERIFY_NOT_REACHED(); +} + +} diff --git a/Userland/Libraries/LibUnicode/Forward.h b/Userland/Libraries/LibUnicode/Forward.h index c3b08d3088..22cf7c698e 100644 --- a/Userland/Libraries/LibUnicode/Forward.h +++ b/Userland/Libraries/LibUnicode/Forward.h @@ -11,6 +11,7 @@ namespace Unicode { enum class Block : u16; +enum class EmojiGroup : u8; enum class GeneralCategory : u8; enum class 
GraphemeBreakProperty : u8; enum class Property : u8; @@ -19,6 +20,7 @@ enum class SentenceBreakProperty : u8; enum class WordBreakProperty : u8; struct CurrencyCode; +struct Emoji; struct SpecialCasing; } |