LibUnicode: Reduce Unicode data generator boilerplate

There's a fair amount of boilerplate when, e.g., adding a new UCD file to parse or a new enumeration to generate. Reduce the overhead by adding helper lambdas. Also add a couple of missing spec links with UCD field information.
author: Timothy Flynn <trflynn89@pm.me> 2021-07-27 22:21:53 -0400
committer: Andreas Kling <kling@serenityos.org> 2021-07-28 23:42:29 +0200
commit: dff156b7c679624734f54f1aa0e2cd26d78a8a56 (patch)
tree: a4d1cf8375f481910b5ccd116de96d001bf6a18c /Userland/Libraries/LibUnicode/CodeGenerators
parent: 7827aede6fd8f113e95f7d7225d77039accd4cfa (diff)
download: serenity-dff156b7c679624734f54f1aa0e2cd26d78a8a56.zip
1 files changed, 47 insertions, 108 deletions
diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
index 986723cdda..7b0c6f0fcb 100644
--- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
+++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
@@ -42,10 +42,12 @@ struct SpecialCasing {
 
 // PropList source: https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
 // Property descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#PropList.txt
+//                        https://www.unicode.org/reports/tr44/tr44-13.html#WordBreakProperty.txt
 using PropList = HashMap<String, Vector<CodePointRange>>;
 
 // UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt
 // Field descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#UnicodeData.txt
+//                     https://www.unicode.org/reports/tr44/#General_Category_Values
 struct CodePointData {
     u32 index { 0 };
     u32 code_point { 0 };
@@ -150,9 +152,6 @@ static void parse_special_casing(Core::File& file, UnicodeData& unicode_data)
 
         unicode_data.special_casing.append(move(casing));
     }
-
-    quick_sort(unicode_data.locales);
-    quick_sort(unicode_data.conditions);
 }
 
 static void parse_prop_list(Core::File& file, PropList& prop_list)
@@ -279,11 +278,10 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
         unicode_data.code_point_data.append(move(data));
     }
 
-    quick_sort(unicode_data.general_categories);
     unicode_data.last_contiguous_code_point = *last_contiguous_code_point;
 }
 
-static void generate_unicode_data_header(UnicodeData const& unicode_data)
+static void generate_unicode_data_header(UnicodeData& unicode_data)
 {
     StringBuilder builder;
     SourceGenerator generator { builder };
@@ -291,79 +289,43 @@ static void generate_unicode_data_header(UnicodeData const& unicode_data)
     generator.set("special_casing_size", String::number(unicode_data.largest_special_casing_size));
     generator.set("prop_list_size", String::number(unicode_data.largest_prop_list_size));
 
-    generator.append(R"~~~(
-#pragma once
-
-#include <AK/Optional.h>
-#include <AK/Types.h>
-
-namespace Unicode {
-
-enum class Locale {
-    None,)~~~");
-
-    for (auto const& locale : unicode_data.locales) {
-        generator.set("locale", locale);
-        generator.append(R"~~~(
-    @locale@,)~~~");
-    }
-
-    generator.append(R"~~~(
-};
-
-enum class Condition {
-    None,)~~~");
+    auto generate_enum = [&](StringView name, StringView default_, Vector<String> values) {
+        quick_sort(values);
 
-    for (auto const& condition : unicode_data.conditions) {
-        generator.set("condition", condition);
+        generator.set("name", name);
         generator.append(R"~~~(
-    @condition@,)~~~");
-    }
+enum class @name@ {)~~~");
 
-    generator.append(R"~~~(
-};
+        if (!default_.is_empty())
+            values.prepend(default_);
 
-// https://www.unicode.org/reports/tr44/#General_Category_Values
-enum class GeneralCategory {)~~~");
+        for (auto const& value : values) {
+            generator.set("value", value);
+            generator.append(R"~~~(
+    @value@,)~~~");
+        }
 
-    for (auto const& general_category : unicode_data.general_categories) {
-        generator.set("general_category", general_category);
         generator.append(R"~~~(
-    @general_category@,)~~~");
-    }
-
-    generator.append(R"~~~(
 };
-
-enum class Property {)~~~");
-
-    auto properties = unicode_data.prop_list.keys();
-    quick_sort(properties);
-
-    for (auto const& property : properties) {
-        generator.set("property", property);
-        generator.append(R"~~~(
-    @property@,)~~~");
-    }
+)~~~");
+    };
 
     generator.append(R"~~~(
-};
+#pragma once
 
-enum class WordBreakProperty {
-    Other,)~~~");
+#include <AK/Optional.h>
+#include <AK/Types.h>
 
-    properties = unicode_data.word_break_prop_list.keys();
-    quick_sort(properties);
+namespace Unicode {
+)~~~");
 
-    for (auto const& property : properties) {
-        generator.set("property", property);
-        generator.append(R"~~~(
-    @property@,)~~~");
-    }
+    generate_enum("Locale"sv, "None"sv, move(unicode_data.locales));
+    generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
+    generate_enum("GeneralCategory"sv, {}, move(unicode_data.general_categories));
+    generate_enum("Property"sv, {}, unicode_data.prop_list.keys());
+    generate_enum("WordBreakProperty"sv, "Other"sv, unicode_data.word_break_prop_list.keys());
 
     generator.append(R"~~~(
-};
-
 struct SpecialCasing {
     u32 code_point { 0 };
 
@@ -590,56 +552,33 @@ int main(int argc, char** argv)
         args_parser.print_usage(stderr, argv[0]);
         return 1;
     }
-    if (!unicode_data_path) {
-        warnln("-u/--unicode-data-path is required");
-        args_parser.print_usage(stderr, argv[0]);
-        return 1;
-    }
-    if (!special_casing_path) {
-        warnln("-s/--special-casing-path is required");
-        args_parser.print_usage(stderr, argv[0]);
-        return 1;
-    }
-    if (!prop_list_path) {
-        warnln("-p/--prop-list-path is required");
-        args_parser.print_usage(stderr, argv[0]);
-        return 1;
-    }
-    if (!word_break_path) {
-        warnln("-w/--word-break-path is required");
-        args_parser.print_usage(stderr, argv[0]);
-        return 1;
-    }
 
-    auto unicode_data_file_or_error = Core::File::open(unicode_data_path, Core::OpenMode::ReadOnly);
-    if (unicode_data_file_or_error.is_error()) {
-        warnln("Failed to open {}: {}", unicode_data_path, unicode_data_file_or_error.release_error());
-        return 1;
-    }
+    auto open_file = [&](StringView path, StringView flags) {
+        if (path.is_empty()) {
+            warnln("{} is required", flags);
+            args_parser.print_usage(stderr, argv[0]);
+            exit(1);
+        }
 
-    auto special_casing_file_or_error = Core::File::open(special_casing_path, Core::OpenMode::ReadOnly);
-    if (special_casing_file_or_error.is_error()) {
-        warnln("Failed to open {}: {}", special_casing_path, special_casing_file_or_error.release_error());
-        return 1;
-    }
+        auto file_or_error = Core::File::open(path, Core::OpenMode::ReadOnly);
+        if (file_or_error.is_error()) {
+            warnln("Failed to open {}: {}", path, file_or_error.release_error());
+            exit(1);
+        }
 
-    auto prop_list_file_or_error = Core::File::open(prop_list_path, Core::OpenMode::ReadOnly);
-    if (prop_list_file_or_error.is_error()) {
-        warnln("Failed to open {}: {}", prop_list_path, prop_list_file_or_error.release_error());
-        return 1;
-    }
+        return file_or_error.release_value();
+    };
 
-    auto word_break_file_or_error = Core::File::open(word_break_path, Core::OpenMode::ReadOnly);
-    if (word_break_file_or_error.is_error()) {
-        warnln("Failed to open {}: {}", word_break_path, word_break_file_or_error.release_error());
-        return 1;
-    }
+    auto unicode_data_file = open_file(unicode_data_path, "-u/--unicode-data-path");
+    auto special_casing_file = open_file(special_casing_path, "-s/--special-casing-path");
+    auto prop_list_file = open_file(prop_list_path, "-p/--prop-list-path");
+    auto word_break_file = open_file(word_break_path, "-w/--word-break-path");
 
     UnicodeData unicode_data {};
-    parse_special_casing(special_casing_file_or_error.value(), unicode_data);
-    parse_prop_list(prop_list_file_or_error.value(), unicode_data.prop_list);
-    parse_prop_list(word_break_file_or_error.value(), unicode_data.word_break_prop_list);
-    parse_unicode_data(unicode_data_file_or_error.value(), unicode_data);
+    parse_special_casing(special_casing_file, unicode_data);
+    parse_prop_list(prop_list_file, unicode_data.prop_list);
+    parse_prop_list(word_break_file, unicode_data.word_break_prop_list);
+    parse_unicode_data(unicode_data_file, unicode_data);
 
     if (generate_header)
         generate_unicode_data_header(unicode_data);
author	Timothy Flynn <trflynn89@pm.me>	2021-07-27 22:21:53 -0400
committer	Andreas Kling <kling@serenityos.org>	2021-07-28 23:42:29 +0200
commit	dff156b7c679624734f54f1aa0e2cd26d78a8a56 (patch)
tree	a4d1cf8375f481910b5ccd116de96d001bf6a18c /Userland/Libraries/LibUnicode/CodeGenerators
parent	7827aede6fd8f113e95f7d7225d77039accd4cfa (diff)
download	serenity-dff156b7c679624734f54f1aa0e2cd26d78a8a56.zip