LibUnicode: Download and parse the word break property list UCD file

Note that unlike the main property list, each code point has at most one word break property. Code points that do not have a word break property are to be assigned the property "Other".
author: Timothy Flynn <trflynn89@pm.me> 2021-07-27 18:24:39 -0400
committer: Andreas Kling <kling@serenityos.org> 2021-07-28 23:42:29 +0200
commit: 12fb3ae0334e282233fc00b211ffab8622d37a38 (patch)
tree: 6ff94a2c29eb25e45a5be782269d65f78476e259 /Userland/Libraries/LibUnicode
parent: c45a014645649380e4f0928fba52a07cc1a147a8 (diff)
download: serenity-12fb3ae0334e282233fc00b211ffab8622d37a38.zip
2 files changed, 60 insertions, 5 deletions
diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
index 5a50257ed3..986723cdda 100644
--- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
+++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
@@ -65,6 +65,7 @@ struct CodePointData {
     Optional<u32> simple_titlecase_mapping;
     Vector<u32> special_casing_indices;
     Vector<StringView> prop_list;
+    StringView word_break_property;
 };
 
 struct UnicodeData {
@@ -81,6 +82,8 @@ struct UnicodeData {
 
     PropList prop_list;
     u32 largest_prop_list_size { 0 };
+
+    PropList word_break_prop_list;
 };
 
 static constexpr auto s_desired_fields = Array {
@@ -152,7 +155,7 @@ static void parse_special_casing(Core::File& file, UnicodeData& unicode_data)
     quick_sort(unicode_data.conditions);
 }
 
-static void parse_prop_list(Core::File& file, UnicodeData& unicode_data)
+static void parse_prop_list(Core::File& file, PropList& prop_list)
 {
     while (file.can_read_line()) {
         auto line = file.read_line();
@@ -169,7 +172,7 @@ static void parse_prop_list(Core::File& file, UnicodeData& unicode_data)
         auto property = segments[1].trim_whitespace().to_string();
         property.replace("_", "", true);
 
-        auto& code_points = unicode_data.prop_list.ensure(property);
+        auto& code_points = prop_list.ensure(property);
 
         if (code_point_range.contains(".."sv)) {
             segments = code_point_range.split_view(".."sv);
@@ -253,6 +256,19 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
             }
         }
 
+        for (auto const& property : unicode_data.word_break_prop_list) {
+            for (auto const& range : property.value) {
+                if ((range.first <= data.code_point) && (data.code_point <= range.last)) {
+                    data.word_break_property = property.key;
+                    break;
+                }
+            }
+            if (!data.word_break_property.is_empty())
+                break;
+        }
+        if (data.word_break_property.is_empty())
+            data.word_break_property = "Other"sv;
+
         unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
         unicode_data.largest_prop_list_size = max(unicode_data.largest_prop_list_size, data.prop_list.size());
 
@@ -333,6 +349,21 @@ enum class Property {)~~~");
     generator.append(R"~~~(
 };
 
+enum class WordBreakProperty {
+    Other,)~~~");
+
+    properties = unicode_data.word_break_prop_list.keys();
+    quick_sort(properties);
+
+    for (auto const& property : properties) {
+        generator.set("property", property);
+        generator.append(R"~~~(
+    @property@,)~~~");
+    }
+
+    generator.append(R"~~~(
+};
+
 struct SpecialCasing {
     u32 code_point { 0 };
 
@@ -385,6 +416,8 @@ struct UnicodeData {
 
     Property prop_list[@prop_list_size@] {};
     u32 prop_list_size { 0 };
+
+    WordBreakProperty word_break_property { WordBreakProperty::Other };
 };
 
 Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
@@ -484,6 +517,7 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~
         append_field("simple_titlecase_mapping", String::formatted("{:#x}", data.simple_titlecase_mapping.value_or(data.code_point)));
         append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
         append_list_and_size(data.prop_list, "Property::{}"sv);
+        generator.append(String::formatted(", WordBreakProperty::{}", data.word_break_property));
         generator.append(" },");
     }
 
@@ -540,6 +574,7 @@ int main(int argc, char** argv)
     char const* unicode_data_path = nullptr;
     char const* special_casing_path = nullptr;
     char const* prop_list_path = nullptr;
+    char const* word_break_path = nullptr;
 
     Core::ArgsParser args_parser;
     args_parser.add_option(generate_header, "Generate the Unicode Data header file", "generate-header", 'h');
@@ -547,6 +582,7 @@ int main(int argc, char** argv)
     args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
     args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path");
     args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
+    args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
     args_parser.parse(argc, argv);
 
     if (!generate_header && !generate_implementation) {
@@ -569,6 +605,11 @@ int main(int argc, char** argv)
         args_parser.print_usage(stderr, argv[0]);
         return 1;
     }
+    if (!word_break_path) {
+        warnln("-w/--word-break-path is required");
+        args_parser.print_usage(stderr, argv[0]);
+        return 1;
+    }
 
     auto unicode_data_file_or_error = Core::File::open(unicode_data_path, Core::OpenMode::ReadOnly);
     if (unicode_data_file_or_error.is_error()) {
@@ -588,9 +629,16 @@ int main(int argc, char** argv)
         return 1;
     }
 
+    auto word_break_file_or_error = Core::File::open(word_break_path, Core::OpenMode::ReadOnly);
+    if (word_break_file_or_error.is_error()) {
+        warnln("Failed to open {}: {}", word_break_path, word_break_file_or_error.release_error());
+        return 1;
+    }
+
     UnicodeData unicode_data {};
     parse_special_casing(special_casing_file_or_error.value(), unicode_data);
-    parse_prop_list(prop_list_file_or_error.value(), unicode_data);
+    parse_prop_list(prop_list_file_or_error.value(), unicode_data.prop_list);
+    parse_prop_list(word_break_file_or_error.value(), unicode_data.word_break_prop_list);
     parse_unicode_data(unicode_data_file_or_error.value(), unicode_data);
 
     if (generate_header)
diff --git a/Userland/Libraries/LibUnicode/unicode_data.cmake b/Userland/Libraries/LibUnicode/unicode_data.cmake
index 8b8a10ed47..5197a75209 100644
--- a/Userland/Libraries/LibUnicode/unicode_data.cmake
+++ b/Userland/Libraries/LibUnicode/unicode_data.cmake
@@ -9,6 +9,9 @@ set(SPECIAL_CASING_PATH ${CMAKE_BINARY_DIR}/UCD/SpecialCasing.txt)
 set(PROP_LIST_URL https://www.unicode.org/Public/13.0.0/ucd/PropList.txt)
 set(PROP_LIST_PATH ${CMAKE_BINARY_DIR}/UCD/PropList.txt)
 
+set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt)
+set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt)
+
 if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
     if (NOT EXISTS ${UNICODE_DATA_PATH})
         message(STATUS "Downloading UCD UnicodeData.txt from ${UNICODE_DATA_URL}...")
@@ -22,6 +25,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
         message(STATUS "Downloading UCD PropList.txt from ${PROP_LIST_URL}...")
         file(DOWNLOAD ${PROP_LIST_URL} ${PROP_LIST_PATH} INACTIVITY_TIMEOUT 10)
     endif()
+    if (NOT EXISTS ${WORD_BREAK_PATH})
+        message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...")
+        file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10)
+    endif()
 
     set(UNICODE_GENERATOR CodeGenerators/GenerateUnicodeData)
     set(UNICODE_DATA_HEADER UnicodeData.h)
@@ -39,7 +46,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
 
     add_custom_command(
         OUTPUT ${UNICODE_DATA_HEADER}
-        COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} ${UNICODE_GENERATOR} -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH}
+        COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} ${UNICODE_GENERATOR} -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -w ${WORD_BREAK_PATH}
         VERBATIM
         DEPENDS GenerateUnicodeData
         MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH}
@@ -47,7 +54,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
 
     add_custom_command(
         OUTPUT ${UNICODE_DATA_IMPLEMENTATION}
-        COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} ${UNICODE_GENERATOR} -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH}
+        COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} ${UNICODE_GENERATOR} -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -w ${WORD_BREAK_PATH}
         VERBATIM
         DEPENDS GenerateUnicodeData
         MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH}
author	Timothy Flynn <trflynn89@pm.me>	2021-07-27 18:24:39 -0400
committer	Andreas Kling <kling@serenityos.org>	2021-07-28 23:42:29 +0200
commit	12fb3ae0334e282233fc00b211ffab8622d37a38 (patch)
tree	6ff94a2c29eb25e45a5be782269d65f78476e259 /Userland/Libraries/LibUnicode
parent	c45a014645649380e4f0928fba52a07cc1a147a8 (diff)
download	serenity-12fb3ae0334e282233fc00b211ffab8622d37a38.zip