summary | refs | log | tree | commit | diff
path: root/Userland/Libraries/LibUnicode/CodeGenerators
diff options
context:
space:
mode:
author: Timothy Flynn <trflynn89@pm.me> 2021-07-27 18:24:39 -0400
committer: Andreas Kling <kling@serenityos.org> 2021-07-28 23:42:29 +0200
commit: 12fb3ae0334e282233fc00b211ffab8622d37a38 (patch)
tree: 6ff94a2c29eb25e45a5be782269d65f78476e259 /Userland/Libraries/LibUnicode/CodeGenerators
parent: c45a014645649380e4f0928fba52a07cc1a147a8 (diff)
download: serenity-12fb3ae0334e282233fc00b211ffab8622d37a38.zip
LibUnicode: Download and parse the word break property list UCD file
Note that unlike the main property list, each code point has only one word break property. Code points that do not have a word break property are to be assigned the property "Other".
Diffstat (limited to 'Userland/Libraries/LibUnicode/CodeGenerators')
-rw-r--r-- Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp | 54
1 files changed, 51 insertions, 3 deletions
diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
index 5a50257ed3..986723cdda 100644
--- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
+++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
@@ -65,6 +65,7 @@ struct CodePointData {
Optional<u32> simple_titlecase_mapping;
Vector<u32> special_casing_indices;
Vector<StringView> prop_list;
+ StringView word_break_property;
};
struct UnicodeData {
@@ -81,6 +82,8 @@ struct UnicodeData {
PropList prop_list;
u32 largest_prop_list_size { 0 };
+
+ PropList word_break_prop_list;
};
static constexpr auto s_desired_fields = Array {
@@ -152,7 +155,7 @@ static void parse_special_casing(Core::File& file, UnicodeData& unicode_data)
quick_sort(unicode_data.conditions);
}
-static void parse_prop_list(Core::File& file, UnicodeData& unicode_data)
+static void parse_prop_list(Core::File& file, PropList& prop_list)
{
while (file.can_read_line()) {
auto line = file.read_line();
@@ -169,7 +172,7 @@ static void parse_prop_list(Core::File& file, UnicodeData& unicode_data)
auto property = segments[1].trim_whitespace().to_string();
property.replace("_", "", true);
- auto& code_points = unicode_data.prop_list.ensure(property);
+ auto& code_points = prop_list.ensure(property);
if (code_point_range.contains(".."sv)) {
segments = code_point_range.split_view(".."sv);
@@ -253,6 +256,19 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
}
}
+ for (auto const& property : unicode_data.word_break_prop_list) {
+ for (auto const& range : property.value) {
+ if ((range.first <= data.code_point) && (data.code_point <= range.last)) {
+ data.word_break_property = property.key;
+ break;
+ }
+ }
+ if (!data.word_break_property.is_empty())
+ break;
+ }
+ if (data.word_break_property.is_empty())
+ data.word_break_property = "Other"sv;
+
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
unicode_data.largest_prop_list_size = max(unicode_data.largest_prop_list_size, data.prop_list.size());
@@ -333,6 +349,21 @@ enum class Property {)~~~");
generator.append(R"~~~(
};
+enum class WordBreakProperty {
+ Other,)~~~");
+
+ properties = unicode_data.word_break_prop_list.keys();
+ quick_sort(properties);
+
+ for (auto const& property : properties) {
+ generator.set("property", property);
+ generator.append(R"~~~(
+ @property@,)~~~");
+ }
+
+ generator.append(R"~~~(
+};
+
struct SpecialCasing {
u32 code_point { 0 };
@@ -385,6 +416,8 @@ struct UnicodeData {
Property prop_list[@prop_list_size@] {};
u32 prop_list_size { 0 };
+
+ WordBreakProperty word_break_property { WordBreakProperty::Other };
};
Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
@@ -484,6 +517,7 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~
append_field("simple_titlecase_mapping", String::formatted("{:#x}", data.simple_titlecase_mapping.value_or(data.code_point)));
append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
append_list_and_size(data.prop_list, "Property::{}"sv);
+ generator.append(String::formatted(", WordBreakProperty::{}", data.word_break_property));
generator.append(" },");
}
@@ -540,6 +574,7 @@ int main(int argc, char** argv)
char const* unicode_data_path = nullptr;
char const* special_casing_path = nullptr;
char const* prop_list_path = nullptr;
+ char const* word_break_path = nullptr;
Core::ArgsParser args_parser;
args_parser.add_option(generate_header, "Generate the Unicode Data header file", "generate-header", 'h');
@@ -547,6 +582,7 @@ int main(int argc, char** argv)
args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path");
args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
+ args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
args_parser.parse(argc, argv);
if (!generate_header && !generate_implementation) {
@@ -569,6 +605,11 @@ int main(int argc, char** argv)
args_parser.print_usage(stderr, argv[0]);
return 1;
}
+ if (!word_break_path) {
+ warnln("-w/--word-break-path is required");
+ args_parser.print_usage(stderr, argv[0]);
+ return 1;
+ }
auto unicode_data_file_or_error = Core::File::open(unicode_data_path, Core::OpenMode::ReadOnly);
if (unicode_data_file_or_error.is_error()) {
@@ -588,9 +629,16 @@ int main(int argc, char** argv)
return 1;
}
+ auto word_break_file_or_error = Core::File::open(word_break_path, Core::OpenMode::ReadOnly);
+ if (word_break_file_or_error.is_error()) {
+ warnln("Failed to open {}: {}", word_break_path, word_break_file_or_error.release_error());
+ return 1;
+ }
+
UnicodeData unicode_data {};
parse_special_casing(special_casing_file_or_error.value(), unicode_data);
- parse_prop_list(prop_list_file_or_error.value(), unicode_data);
+ parse_prop_list(prop_list_file_or_error.value(), unicode_data.prop_list);
+ parse_prop_list(word_break_file_or_error.value(), unicode_data.word_break_prop_list);
parse_unicode_data(unicode_data_file_or_error.value(), unicode_data);
if (generate_header)