diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-07-27 18:24:39 -0400 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2021-07-28 23:42:29 +0200 |
commit | 12fb3ae0334e282233fc00b211ffab8622d37a38 (patch) | |
tree | 6ff94a2c29eb25e45a5be782269d65f78476e259 /Userland/Libraries/LibUnicode | |
parent | c45a014645649380e4f0928fba52a07cc1a147a8 (diff) | |
download | serenity-12fb3ae0334e282233fc00b211ffab8622d37a38.zip |
LibUnicode: Download and parse the word break property list UCD file

Note that unlike the main property list, where a code point may have
several properties, each code point has at most one word break property.
Code points that are not listed with a word break property are assigned
the property "Other".
Diffstat (limited to 'Userland/Libraries/LibUnicode')
-rw-r--r-- | Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp | 54 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/unicode_data.cmake | 11 |
2 files changed, 60 insertions, 5 deletions
diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp index 5a50257ed3..986723cdda 100644 --- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp +++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp @@ -65,6 +65,7 @@ struct CodePointData { Optional<u32> simple_titlecase_mapping; Vector<u32> special_casing_indices; Vector<StringView> prop_list; + StringView word_break_property; }; struct UnicodeData { @@ -81,6 +82,8 @@ struct UnicodeData { PropList prop_list; u32 largest_prop_list_size { 0 }; + + PropList word_break_prop_list; }; static constexpr auto s_desired_fields = Array { @@ -152,7 +155,7 @@ static void parse_special_casing(Core::File& file, UnicodeData& unicode_data) quick_sort(unicode_data.conditions); } -static void parse_prop_list(Core::File& file, UnicodeData& unicode_data) +static void parse_prop_list(Core::File& file, PropList& prop_list) { while (file.can_read_line()) { auto line = file.read_line(); @@ -169,7 +172,7 @@ static void parse_prop_list(Core::File& file, UnicodeData& unicode_data) auto property = segments[1].trim_whitespace().to_string(); property.replace("_", "", true); - auto& code_points = unicode_data.prop_list.ensure(property); + auto& code_points = prop_list.ensure(property); if (code_point_range.contains(".."sv)) { segments = code_point_range.split_view(".."sv); @@ -253,6 +256,19 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) } } + for (auto const& property : unicode_data.word_break_prop_list) { + for (auto const& range : property.value) { + if ((range.first <= data.code_point) && (data.code_point <= range.last)) { + data.word_break_property = property.key; + break; + } + } + if (!data.word_break_property.is_empty()) + break; + } + if (data.word_break_property.is_empty()) + data.word_break_property = "Other"sv; + unicode_data.largest_special_casing_size = 
max(unicode_data.largest_special_casing_size, data.special_casing_indices.size()); unicode_data.largest_prop_list_size = max(unicode_data.largest_prop_list_size, data.prop_list.size()); @@ -333,6 +349,21 @@ enum class Property {)~~~"); generator.append(R"~~~( }; +enum class WordBreakProperty { + Other,)~~~"); + + properties = unicode_data.word_break_prop_list.keys(); + quick_sort(properties); + + for (auto const& property : properties) { + generator.set("property", property); + generator.append(R"~~~( + @property@,)~~~"); + } + + generator.append(R"~~~( +}; + struct SpecialCasing { u32 code_point { 0 }; @@ -385,6 +416,8 @@ struct UnicodeData { Property prop_list[@prop_list_size@] {}; u32 prop_list_size { 0 }; + + WordBreakProperty word_break_property { WordBreakProperty::Other }; }; Optional<UnicodeData> unicode_data_for_code_point(u32 code_point); @@ -484,6 +517,7 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~ append_field("simple_titlecase_mapping", String::formatted("{:#x}", data.simple_titlecase_mapping.value_or(data.code_point))); append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv); append_list_and_size(data.prop_list, "Property::{}"sv); + generator.append(String::formatted(", WordBreakProperty::{}", data.word_break_property)); generator.append(" },"); } @@ -540,6 +574,7 @@ int main(int argc, char** argv) char const* unicode_data_path = nullptr; char const* special_casing_path = nullptr; char const* prop_list_path = nullptr; + char const* word_break_path = nullptr; Core::ArgsParser args_parser; args_parser.add_option(generate_header, "Generate the Unicode Data header file", "generate-header", 'h'); @@ -547,6 +582,7 @@ int main(int argc, char** argv) args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path"); args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path"); 
args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path"); + args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path"); args_parser.parse(argc, argv); if (!generate_header && !generate_implementation) { @@ -569,6 +605,11 @@ int main(int argc, char** argv) args_parser.print_usage(stderr, argv[0]); return 1; } + if (!word_break_path) { + warnln("-w/--word-break-path is required"); + args_parser.print_usage(stderr, argv[0]); + return 1; + } auto unicode_data_file_or_error = Core::File::open(unicode_data_path, Core::OpenMode::ReadOnly); if (unicode_data_file_or_error.is_error()) { @@ -588,9 +629,16 @@ int main(int argc, char** argv) return 1; } + auto word_break_file_or_error = Core::File::open(word_break_path, Core::OpenMode::ReadOnly); + if (word_break_file_or_error.is_error()) { + warnln("Failed to open {}: {}", word_break_path, word_break_file_or_error.release_error()); + return 1; + } + UnicodeData unicode_data {}; parse_special_casing(special_casing_file_or_error.value(), unicode_data); - parse_prop_list(prop_list_file_or_error.value(), unicode_data); + parse_prop_list(prop_list_file_or_error.value(), unicode_data.prop_list); + parse_prop_list(word_break_file_or_error.value(), unicode_data.word_break_prop_list); parse_unicode_data(unicode_data_file_or_error.value(), unicode_data); if (generate_header) diff --git a/Userland/Libraries/LibUnicode/unicode_data.cmake b/Userland/Libraries/LibUnicode/unicode_data.cmake index 8b8a10ed47..5197a75209 100644 --- a/Userland/Libraries/LibUnicode/unicode_data.cmake +++ b/Userland/Libraries/LibUnicode/unicode_data.cmake @@ -9,6 +9,9 @@ set(SPECIAL_CASING_PATH ${CMAKE_BINARY_DIR}/UCD/SpecialCasing.txt) set(PROP_LIST_URL https://www.unicode.org/Public/13.0.0/ucd/PropList.txt) set(PROP_LIST_PATH ${CMAKE_BINARY_DIR}/UCD/PropList.txt) +set(WORD_BREAK_URL 
https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt) +set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt) + if (ENABLE_UNICODE_DATABASE_DOWNLOAD) if (NOT EXISTS ${UNICODE_DATA_PATH}) message(STATUS "Downloading UCD UnicodeData.txt from ${UNICODE_DATA_URL}...") @@ -22,6 +25,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) message(STATUS "Downloading UCD PropList.txt from ${PROP_LIST_URL}...") file(DOWNLOAD ${PROP_LIST_URL} ${PROP_LIST_PATH} INACTIVITY_TIMEOUT 10) endif() + if (NOT EXISTS ${WORD_BREAK_PATH}) + message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...") + file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10) + endif() set(UNICODE_GENERATOR CodeGenerators/GenerateUnicodeData) set(UNICODE_DATA_HEADER UnicodeData.h) @@ -39,7 +46,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) add_custom_command( OUTPUT ${UNICODE_DATA_HEADER} - COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} ${UNICODE_GENERATOR} -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} + COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} ${UNICODE_GENERATOR} -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -w ${WORD_BREAK_PATH} VERBATIM DEPENDS GenerateUnicodeData MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} @@ -47,7 +54,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) add_custom_command( OUTPUT ${UNICODE_DATA_IMPLEMENTATION} - COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} ${UNICODE_GENERATOR} -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} + COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} ${UNICODE_GENERATOR} -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -w ${WORD_BREAK_PATH} VERBATIM DEPENDS GenerateUnicodeData MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} |