LibUnicode: Download and parse the word break property list UCD file

Note that, unlike the main property list, each code point has only one word break property. Code points that do not have a word break property are assigned the property "Other".
author: Timothy Flynn <trflynn89@pm.me> 2021-07-27 18:24:39 -0400
committer: Andreas Kling <kling@serenityos.org> 2021-07-28 23:42:29 +0200
commit: 12fb3ae0334e282233fc00b211ffab8622d37a38 (patch)
tree: 6ff94a2c29eb25e45a5be782269d65f78476e259 /Userland/Libraries/LibUnicode/CodeGenerators
parent: c45a014645649380e4f0928fba52a07cc1a147a8 (diff)
download: serenity-12fb3ae0334e282233fc00b211ffab8622d37a38.zip
1 file changed, 51 insertions, 3 deletions
diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
index 5a50257ed3..986723cdda 100644
--- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
+++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
@@ -65,6 +65,7 @@ struct CodePointData {
     Optional<u32> simple_titlecase_mapping;
     Vector<u32> special_casing_indices;
     Vector<StringView> prop_list;
+    StringView word_break_property;
 };
 
 struct UnicodeData {
@@ -81,6 +82,8 @@ struct UnicodeData {
 
     PropList prop_list;
     u32 largest_prop_list_size { 0 };
+
+    PropList word_break_prop_list;
 };
 
 static constexpr auto s_desired_fields = Array {
@@ -152,7 +155,7 @@ static void parse_special_casing(Core::File& file, UnicodeData& unicode_data)
     quick_sort(unicode_data.conditions);
 }
 
-static void parse_prop_list(Core::File& file, UnicodeData& unicode_data)
+static void parse_prop_list(Core::File& file, PropList& prop_list)
 {
     while (file.can_read_line()) {
         auto line = file.read_line();
@@ -169,7 +172,7 @@ static void parse_prop_list(Core::File& file, UnicodeData& unicode_data)
         auto property = segments[1].trim_whitespace().to_string();
         property.replace("_", "", true);
 
-        auto& code_points = unicode_data.prop_list.ensure(property);
+        auto& code_points = prop_list.ensure(property);
 
         if (code_point_range.contains(".."sv)) {
             segments = code_point_range.split_view(".."sv);
@@ -253,6 +256,19 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
             }
         }
 
+        for (auto const& property : unicode_data.word_break_prop_list) {
+            for (auto const& range : property.value) {
+                if ((range.first <= data.code_point) && (data.code_point <= range.last)) {
+                    data.word_break_property = property.key;
+                    break;
+                }
+            }
+            if (!data.word_break_property.is_empty())
+                break;
+        }
+        if (data.word_break_property.is_empty())
+            data.word_break_property = "Other"sv;
+
         unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
         unicode_data.largest_prop_list_size = max(unicode_data.largest_prop_list_size, data.prop_list.size());
 
@@ -333,6 +349,21 @@ enum class Property {)~~~");
     generator.append(R"~~~(
 };
 
+enum class WordBreakProperty {
+    Other,)~~~");
+
+    properties = unicode_data.word_break_prop_list.keys();
+    quick_sort(properties);
+
+    for (auto const& property : properties) {
+        generator.set("property", property);
+        generator.append(R"~~~(
+    @property@,)~~~");
+    }
+
+    generator.append(R"~~~(
+};
+
 struct SpecialCasing {
     u32 code_point { 0 };
 
@@ -385,6 +416,8 @@ struct UnicodeData {
 
     Property prop_list[@prop_list_size@] {};
     u32 prop_list_size { 0 };
+
+    WordBreakProperty word_break_property { WordBreakProperty::Other };
 };
 
 Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
@@ -484,6 +517,7 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~
         append_field("simple_titlecase_mapping", String::formatted("{:#x}", data.simple_titlecase_mapping.value_or(data.code_point)));
         append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
         append_list_and_size(data.prop_list, "Property::{}"sv);
+        generator.append(String::formatted(", WordBreakProperty::{}", data.word_break_property));
         generator.append(" },");
     }
 
@@ -540,6 +574,7 @@ int main(int argc, char** argv)
     char const* unicode_data_path = nullptr;
     char const* special_casing_path = nullptr;
     char const* prop_list_path = nullptr;
+    char const* word_break_path = nullptr;
 
     Core::ArgsParser args_parser;
     args_parser.add_option(generate_header, "Generate the Unicode Data header file", "generate-header", 'h');
@@ -547,6 +582,7 @@ int main(int argc, char** argv)
     args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
     args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path");
     args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
+    args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
     args_parser.parse(argc, argv);
 
     if (!generate_header && !generate_implementation) {
@@ -569,6 +605,11 @@ int main(int argc, char** argv)
         args_parser.print_usage(stderr, argv[0]);
         return 1;
     }
+    if (!word_break_path) {
+        warnln("-w/--word-break-path is required");
+        args_parser.print_usage(stderr, argv[0]);
+        return 1;
+    }
 
     auto unicode_data_file_or_error = Core::File::open(unicode_data_path, Core::OpenMode::ReadOnly);
     if (unicode_data_file_or_error.is_error()) {
@@ -588,9 +629,16 @@ int main(int argc, char** argv)
         return 1;
     }
 
+    auto word_break_file_or_error = Core::File::open(word_break_path, Core::OpenMode::ReadOnly);
+    if (word_break_file_or_error.is_error()) {
+        warnln("Failed to open {}: {}", word_break_path, word_break_file_or_error.release_error());
+        return 1;
+    }
+
     UnicodeData unicode_data {};
     parse_special_casing(special_casing_file_or_error.value(), unicode_data);
-    parse_prop_list(prop_list_file_or_error.value(), unicode_data);
+    parse_prop_list(prop_list_file_or_error.value(), unicode_data.prop_list);
+    parse_prop_list(word_break_file_or_error.value(), unicode_data.word_break_prop_list);
     parse_unicode_data(unicode_data_file_or_error.value(), unicode_data);
 
     if (generate_header)
author	Timothy Flynn <trflynn89@pm.me>	2021-07-27 18:24:39 -0400
committer	Andreas Kling <kling@serenityos.org>	2021-07-28 23:42:29 +0200
commit	12fb3ae0334e282233fc00b211ffab8622d37a38 (patch)
tree	6ff94a2c29eb25e45a5be782269d65f78476e259 /Userland/Libraries/LibUnicode/CodeGenerators
parent	c45a014645649380e4f0928fba52a07cc1a147a8 (diff)
download	serenity-12fb3ae0334e282233fc00b211ffab8622d37a38.zip