1 files changed, 51 insertions, 3 deletions
diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
index 5a50257ed3..986723cdda 100644
--- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
+++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
@@ -65,6 +65,7 @@ struct CodePointData {
     Optional<u32> simple_titlecase_mapping;
     Vector<u32> special_casing_indices;
     Vector<StringView> prop_list;
+    StringView word_break_property; // Views a key of UnicodeData::word_break_prop_list, or the literal "Other" when no range contains this code point.
 };
 
 struct UnicodeData {
@@ -81,6 +82,8 @@ struct UnicodeData {
 
     PropList prop_list;
     u32 largest_prop_list_size { 0 };
+
+    PropList word_break_prop_list; // Code point ranges keyed by word break property name; filled by parse_prop_list() from the word break data file.
 };
 
 static constexpr auto s_desired_fields = Array {
@@ -152,7 +155,7 @@ static void parse_special_casing(Core::File& file, UnicodeData& unicode_data)
     quick_sort(unicode_data.conditions);
 }
 
-static void parse_prop_list(Core::File& file, UnicodeData& unicode_data)
+static void parse_prop_list(Core::File& file, PropList& prop_list) // Destination list is chosen by the caller so one parser serves both the PropList and word break files.
 {
     while (file.can_read_line()) {
         auto line = file.read_line();
@@ -169,7 +172,7 @@ static void parse_prop_list(Core::File& file, UnicodeData& unicode_data)
         auto property = segments[1].trim_whitespace().to_string();
         property.replace("_", "", true);
 
-        auto& code_points = unicode_data.prop_list.ensure(property);
+        auto& code_points = prop_list.ensure(property); // Keyed by the name with underscores removed; ensure() presumably creates the entry on first use - confirm.
 
         if (code_point_range.contains(".."sv)) {
             segments = code_point_range.split_view(".."sv);
@@ -253,6 +256,19 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
             }
         }
 
+        for (auto const& property : unicode_data.word_break_prop_list) { // Search every word break property's ranges for this code point.
+            for (auto const& range : property.value) {
+                if ((range.first <= data.code_point) && (data.code_point <= range.last)) { // Both range bounds are inclusive.
+                    data.word_break_property = property.key;
+                    break;
+                }
+            }
+            if (!data.word_break_property.is_empty()) // The inner loop found a match; keep the first matching property.
+                break;
+        }
+        if (data.word_break_property.is_empty()) // No parsed range contains this code point.
+            data.word_break_property = "Other"sv;
+
         unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
         unicode_data.largest_prop_list_size = max(unicode_data.largest_prop_list_size, data.prop_list.size());
 
@@ -333,6 +349,21 @@ enum class Property {)~~~");
     generator.append(R"~~~(
 };
 
+enum class WordBreakProperty {
+    Other,)~~~");
+
+    properties = unicode_data.word_break_prop_list.keys(); // Reassigns the existing `properties` variable with the word break property names.
+    quick_sort(properties); // Names are sorted before being emitted as enumerators after the hard-coded "Other".
+
+    for (auto const& property : properties) {
+        generator.set("property", property);
+        generator.append(R"~~~(
+    @property@,)~~~");
+    }
+
+    generator.append(R"~~~(
+};
+
 struct SpecialCasing {
     u32 code_point { 0 };
 
@@ -385,6 +416,8 @@ struct UnicodeData {
 
     Property prop_list[@prop_list_size@] {};
     u32 prop_list_size { 0 };
+
+    WordBreakProperty word_break_property { WordBreakProperty::Other };
 };
 
 Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
@@ -484,6 +517,7 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~
         append_field("simple_titlecase_mapping", String::formatted("{:#x}", data.simple_titlecase_mapping.value_or(data.code_point)));
         append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
         append_list_and_size(data.prop_list, "Property::{}"sv);
+        generator.append(String::formatted(", WordBreakProperty::{}", data.word_break_property)); // Emitted after the prop_list initializers, just before the closing brace of the entry.
         generator.append(" },");
     }
 
@@ -540,6 +574,7 @@ int main(int argc, char** argv)
     char const* unicode_data_path = nullptr;
     char const* special_casing_path = nullptr;
     char const* prop_list_path = nullptr;
+    char const* word_break_path = nullptr;
 
     Core::ArgsParser args_parser;
     args_parser.add_option(generate_header, "Generate the Unicode Data header file", "generate-header", 'h');
@@ -547,6 +582,7 @@ int main(int argc, char** argv)
     args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
     args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path");
     args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
+    args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
     args_parser.parse(argc, argv);
 
     if (!generate_header && !generate_implementation) {
@@ -569,6 +605,11 @@ int main(int argc, char** argv)
         args_parser.print_usage(stderr, argv[0]);
         return 1;
     }
+    if (!word_break_path) { // Still nullptr after argument parsing: the option was not supplied.
+        warnln("-w/--word-break-path is required");
+        args_parser.print_usage(stderr, argv[0]);
+        return 1;
+    }
 
     auto unicode_data_file_or_error = Core::File::open(unicode_data_path, Core::OpenMode::ReadOnly);
     if (unicode_data_file_or_error.is_error()) {
@@ -588,9 +629,16 @@ int main(int argc, char** argv)
         return 1;
     }
 
+    auto word_break_file_or_error = Core::File::open(word_break_path, Core::OpenMode::ReadOnly); // Opened before any parsing starts, so a bad path exits early.
+    if (word_break_file_or_error.is_error()) {
+        warnln("Failed to open {}: {}", word_break_path, word_break_file_or_error.release_error());
+        return 1;
+    }
+
     UnicodeData unicode_data {};
     parse_special_casing(special_casing_file_or_error.value(), unicode_data);
-    parse_prop_list(prop_list_file_or_error.value(), unicode_data);
+    parse_prop_list(prop_list_file_or_error.value(), unicode_data.prop_list);
+    parse_prop_list(word_break_file_or_error.value(), unicode_data.word_break_prop_list); // Must precede parse_unicode_data(), which reads word_break_prop_list for each code point.
     parse_unicode_data(unicode_data_file_or_error.value(), unicode_data);
 
     if (generate_header)