From 9113f892a77237df2504d5a994dd4aec81434909 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 4 Aug 2021 07:46:36 -0400 Subject: LibUnicode: Parse UCD emoji-data.txt and generate Unicode property --- Userland/Libraries/LibUnicode/CharacterTypes.cpp | 12 ++++++------ .../LibUnicode/CodeGenerators/GenerateUnicodeData.cpp | 4 ++++ Userland/Libraries/LibUnicode/unicode_data.cmake | 11 +++++++++-- 3 files changed, 19 insertions(+), 8 deletions(-) (limited to 'Userland/Libraries/LibUnicode') diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 114cfb1c61..cf866f2ebc 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -275,12 +275,12 @@ bool is_ecma262_property([[maybe_unused]] Property property) case Unicode::Property::Default_Ignorable_Code_Point: case Unicode::Property::Deprecated: case Unicode::Property::Diacritic: - // case Unicode::Property::Emoji: - // case Unicode::Property::Emoji_Component: - // case Unicode::Property::Emoji_Modifier: - // case Unicode::Property::Emoji_Modifier_Base: - // case Unicode::Property::Emoji_Presentation: - // case Unicode::Property::Extended_Pictographic: + case Unicode::Property::Emoji: + case Unicode::Property::Emoji_Component: + case Unicode::Property::Emoji_Modifier: + case Unicode::Property::Emoji_Modifier_Base: + case Unicode::Property::Emoji_Presentation: + case Unicode::Property::Extended_Pictographic: case Unicode::Property::Extender: case Unicode::Property::Grapheme_Base: case Unicode::Property::Grapheme_Extend: diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp index 6d3b30308c..619ed6ebd1 100644 --- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp +++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp @@ -828,6 +828,7 @@ int main(int argc, 
char** argv) char const* scripts_path = nullptr; char const* script_extensions_path = nullptr; char const* word_break_path = nullptr; + char const* emoji_data_path = nullptr; Core::ArgsParser args_parser; args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path"); @@ -841,6 +842,7 @@ int main(int argc, char** argv) args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path"); args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path"); args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path"); + args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path"); args_parser.parse(argc, argv); auto open_file = [&](StringView path, StringView flags, Core::OpenMode mode = Core::OpenMode::ReadOnly) { @@ -870,11 +872,13 @@ int main(int argc, char** argv) auto scripts_file = open_file(scripts_path, "-r/--scripts-path"); auto script_extensions_file = open_file(script_extensions_path, "-x/--script-extensions-path"); auto word_break_file = open_file(word_break_path, "-w/--word-break-path"); + auto emoji_data_file = open_file(emoji_data_path, "-e/--emoji-data-path"); UnicodeData unicode_data {}; parse_special_casing(special_casing_file, unicode_data); parse_prop_list(prop_list_file, unicode_data.prop_list); parse_prop_list(derived_core_prop_file, unicode_data.prop_list); + parse_prop_list(emoji_data_file, unicode_data.prop_list); parse_alias_list(prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases); parse_prop_list(scripts_file, unicode_data.script_list); parse_prop_list(script_extensions_file, unicode_data.script_extensions, true); diff --git a/Userland/Libraries/LibUnicode/unicode_data.cmake 
b/Userland/Libraries/LibUnicode/unicode_data.cmake index dbaeb907b1..2c547654c5 100644 --- a/Userland/Libraries/LibUnicode/unicode_data.cmake +++ b/Userland/Libraries/LibUnicode/unicode_data.cmake @@ -27,6 +27,9 @@ set(SCRIPT_EXTENSIONS_PATH ${CMAKE_BINARY_DIR}/UCD/ScriptExtensions.txt) set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt) set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt) +set(EMOJI_DATA_URL https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt) +set(EMOJI_DATA_PATH ${CMAKE_BINARY_DIR}/UCD/emoji-data.txt) + if (ENABLE_UNICODE_DATABASE_DOWNLOAD) if (NOT EXISTS ${UNICODE_DATA_PATH}) message(STATUS "Downloading UCD UnicodeData.txt from ${UNICODE_DATA_URL}...") @@ -64,6 +67,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...") file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10) endif() + if (NOT EXISTS ${EMOJI_DATA_PATH}) + message(STATUS "Downloading UCD emoji-data.txt from ${EMOJI_DATA_URL}...") + file(DOWNLOAD ${EMOJI_DATA_URL} ${EMOJI_DATA_PATH} INACTIVITY_TIMEOUT 10) + endif() set(UNICODE_DATA_HEADER LibUnicode/UnicodeData.h) set(UNICODE_DATA_IMPLEMENTATION LibUnicode/UnicodeData.cpp) @@ -75,9 +82,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) add_custom_command( OUTPUT ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION} - COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -x ${SCRIPT_EXTENSIONS_PATH} -w ${WORD_BREAK_PATH} + COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -x ${SCRIPT_EXTENSIONS_PATH} -w
${WORD_BREAK_PATH} -e ${EMOJI_DATA_PATH} VERBATIM - DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${SCRIPT_EXTENSIONS_PATH} ${WORD_BREAK_PATH} + DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${SCRIPT_EXTENSIONS_PATH} ${WORD_BREAK_PATH} ${EMOJI_DATA_PATH} ) set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION}) -- cgit v1.2.3