diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-07-27 17:37:47 -0400 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2021-07-28 23:42:29 +0200 |
commit | 38adfd8874354a077f359de5fa272dd56bc78984 (patch) | |
tree | 6de5281f52cea72b030275933652809c9fdc54d3 /Userland/Libraries/LibUnicode | |
parent | c69d8b69250e3939caa9623b522410f09609e316 (diff) | |
download | serenity-38adfd8874354a077f359de5fa272dd56bc78984.zip |
LibUnicode: Download and parse the property list UCD file
Diffstat (limited to 'Userland/Libraries/LibUnicode')
-rw-r--r-- | Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp | 123 |
-rw-r--r-- | Userland/Libraries/LibUnicode/unicode_data.cmake | 11 |
2 files changed, 113 insertions, 21 deletions
diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp index 1e98584247..5a50257ed3 100644 --- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp +++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp @@ -7,6 +7,7 @@ #include <AK/AllOf.h> #include <AK/Array.h> #include <AK/CharacterTypes.h> +#include <AK/HashMap.h> #include <AK/Optional.h> #include <AK/QuickSort.h> #include <AK/SourceGenerator.h> @@ -17,6 +18,16 @@ #include <LibCore/ArgsParser.h> #include <LibCore/File.h> +// Some code points are excluded from UnicodeData.txt, and instead are part of a "range" of code +// points, as indicated by the "name" field. For example: +// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; +// 4DBF;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; +struct CodePointRange { + u32 index; + u32 first; + u32 last; +}; + // SpecialCasing source: https://www.unicode.org/Public/13.0.0/ucd/SpecialCasing.txt // Field descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#SpecialCasing.txt struct SpecialCasing { @@ -29,6 +40,10 @@ struct SpecialCasing { String condition; }; +// PropList source: https://www.unicode.org/Public/13.0.0/ucd/PropList.txt +// Property descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#PropList.txt +using PropList = HashMap<String, Vector<CodePointRange>>; + // UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt // Field descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#UnicodeData.txt struct CodePointData { @@ -49,16 +64,7 @@ struct CodePointData { Optional<u32> simple_lowercase_mapping; Optional<u32> simple_titlecase_mapping; Vector<u32> special_casing_indices; -}; - -// Some code points are excluded from UnicodeData.txt, and instead are part of a "range" of code -// points, as indicated by the "name" field. 
For example: -// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; -// 4DBF;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; -struct CodePointRange { - u32 index; - u32 first; - u32 last; + Vector<StringView> prop_list; }; struct UnicodeData { @@ -72,6 +78,9 @@ struct UnicodeData { Vector<CodePointRange> code_point_ranges; Vector<String> general_categories; u32 last_contiguous_code_point { 0 }; + + PropList prop_list; + u32 largest_prop_list_size { 0 }; }; static constexpr auto s_desired_fields = Array { @@ -143,6 +152,39 @@ static void parse_special_casing(Core::File& file, UnicodeData& unicode_data) quick_sort(unicode_data.conditions); } +static void parse_prop_list(Core::File& file, UnicodeData& unicode_data) +{ + while (file.can_read_line()) { + auto line = file.read_line(); + if (line.is_empty() || line.starts_with('#')) + continue; + + if (auto index = line.find('#'); index.has_value()) + line = line.substring(0, *index); + + auto segments = line.split_view(';', true); + VERIFY(segments.size() == 2); + + auto code_point_range = segments[0].trim_whitespace(); + auto property = segments[1].trim_whitespace().to_string(); + property.replace("_", "", true); + + auto& code_points = unicode_data.prop_list.ensure(property); + + if (code_point_range.contains(".."sv)) { + segments = code_point_range.split_view(".."sv); + VERIFY(segments.size() == 2); + + auto begin = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value(); + auto end = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[1]).value(); + code_points.append({ 0, begin, end }); + } else { + auto code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(code_point_range).value(); + code_points.append({ 0, code_point, code_point }); + } + } +} + static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) { Optional<u32> code_point_range_start; @@ -202,7 +244,17 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) 
data.special_casing_indices.append(casing.index); } + for (auto const& property : unicode_data.prop_list) { + for (auto const& range : property.value) { + if ((range.first <= data.code_point) && (data.code_point <= range.last)) { + data.prop_list.append(property.key); + break; + } + } + } + unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size()); + unicode_data.largest_prop_list_size = max(unicode_data.largest_prop_list_size, data.prop_list.size()); if (!unicode_data.general_categories.contains_slow(data.general_category)) unicode_data.general_categories.append(data.general_category); @@ -221,6 +273,7 @@ static void generate_unicode_data_header(UnicodeData const& unicode_data) SourceGenerator generator { builder }; generator.set("casing_transform_size", String::number(unicode_data.largest_casing_transform_size)); generator.set("special_casing_size", String::number(unicode_data.largest_special_casing_size)); + generator.set("prop_list_size", String::number(unicode_data.largest_prop_list_size)); generator.append(R"~~~( #pragma once @@ -266,6 +319,20 @@ enum class GeneralCategory {)~~~"); generator.append(R"~~~( }; +enum class Property {)~~~"); + + auto properties = unicode_data.prop_list.keys(); + quick_sort(properties); + + for (auto const& property : properties) { + generator.set("property", property); + generator.append(R"~~~( + @property@,)~~~"); + } + + generator.append(R"~~~( +}; + struct SpecialCasing { u32 code_point { 0 }; @@ -315,6 +382,9 @@ struct UnicodeData { SpecialCasing const* special_casing[@special_casing_size@] {}; u32 special_casing_size { 0 }; + + Property prop_list[@prop_list_size@] {}; + u32 prop_list_size { 0 }; }; Optional<UnicodeData> unicode_data_for_code_point(u32 code_point); @@ -342,20 +412,20 @@ static void generate_unicode_data_implementation(UnicodeData unicode_data) namespace Unicode { )~~~"); - auto append_numeric_list = [&](auto const& code_points, StringView 
format) { - if (code_points.is_empty()) { + auto append_list_and_size = [&](auto const& list, StringView format) { + if (list.is_empty()) { generator.append(", {}, 0"); return; } bool first = true; generator.append(", {"); - for (auto code_point : code_points) { + for (auto const& item : list) { generator.append(first ? " " : ", "); - generator.append(String::formatted(format, code_point)); + generator.append(String::formatted(format, item)); first = false; } - generator.append(String::formatted(" }}, {}", code_points.size())); + generator.append(String::formatted(" }}, {}", list.size())); }; generator.append(R"~~~( @@ -367,9 +437,9 @@ static constexpr Array<SpecialCasing, @special_casing_size@> s_special_casing { { @code_point@)~~~"); constexpr auto format = "0x{:x}"sv; - append_numeric_list(casing.lowercase_mapping, format); - append_numeric_list(casing.uppercase_mapping, format); - append_numeric_list(casing.titlecase_mapping, format); + append_list_and_size(casing.lowercase_mapping, format); + append_list_and_size(casing.uppercase_mapping, format); + append_list_and_size(casing.titlecase_mapping, format); generator.set("locale", casing.locale.is_empty() ? 
"None" : casing.locale); generator.append(", Locale::@locale@"); @@ -412,7 +482,8 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~ append_field("simple_uppercase_mapping", String::formatted("{:#x}", data.simple_uppercase_mapping.value_or(data.code_point))); append_field("simple_lowercase_mapping", String::formatted("{:#x}", data.simple_lowercase_mapping.value_or(data.code_point))); append_field("simple_titlecase_mapping", String::formatted("{:#x}", data.simple_titlecase_mapping.value_or(data.code_point))); - append_numeric_list(data.special_casing_indices, "&s_special_casing[{}]"sv); + append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv); + append_list_and_size(data.prop_list, "Property::{}"sv); generator.append(" },"); } @@ -468,12 +539,14 @@ int main(int argc, char** argv) bool generate_implementation = false; char const* unicode_data_path = nullptr; char const* special_casing_path = nullptr; + char const* prop_list_path = nullptr; Core::ArgsParser args_parser; args_parser.add_option(generate_header, "Generate the Unicode Data header file", "generate-header", 'h'); args_parser.add_option(generate_implementation, "Generate the Unicode Data implementation file", "generate-implementation", 'c'); args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path"); args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path"); + args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path"); args_parser.parse(argc, argv); if (!generate_header && !generate_implementation) { @@ -491,6 +564,11 @@ int main(int argc, char** argv) args_parser.print_usage(stderr, argv[0]); return 1; } + if (!prop_list_path) { + warnln("-p/--prop-list-path is required"); + args_parser.print_usage(stderr, argv[0]); + return 1; + } auto unicode_data_file_or_error = 
Core::File::open(unicode_data_path, Core::OpenMode::ReadOnly); if (unicode_data_file_or_error.is_error()) { @@ -504,8 +582,15 @@ int main(int argc, char** argv) return 1; } + auto prop_list_file_or_error = Core::File::open(prop_list_path, Core::OpenMode::ReadOnly); + if (prop_list_file_or_error.is_error()) { + warnln("Failed to open {}: {}", prop_list_path, prop_list_file_or_error.release_error()); + return 1; + } + UnicodeData unicode_data {}; parse_special_casing(special_casing_file_or_error.value(), unicode_data); + parse_prop_list(prop_list_file_or_error.value(), unicode_data); parse_unicode_data(unicode_data_file_or_error.value(), unicode_data); if (generate_header) diff --git a/Userland/Libraries/LibUnicode/unicode_data.cmake b/Userland/Libraries/LibUnicode/unicode_data.cmake index fc8e46c29b..8b8a10ed47 100644 --- a/Userland/Libraries/LibUnicode/unicode_data.cmake +++ b/Userland/Libraries/LibUnicode/unicode_data.cmake @@ -6,6 +6,9 @@ set(UNICODE_DATA_PATH ${CMAKE_BINARY_DIR}/UCD/UnicodeData.txt) set(SPECIAL_CASING_URL https://www.unicode.org/Public/13.0.0/ucd/SpecialCasing.txt) set(SPECIAL_CASING_PATH ${CMAKE_BINARY_DIR}/UCD/SpecialCasing.txt) +set(PROP_LIST_URL https://www.unicode.org/Public/13.0.0/ucd/PropList.txt) +set(PROP_LIST_PATH ${CMAKE_BINARY_DIR}/UCD/PropList.txt) + if (ENABLE_UNICODE_DATABASE_DOWNLOAD) if (NOT EXISTS ${UNICODE_DATA_PATH}) message(STATUS "Downloading UCD UnicodeData.txt from ${UNICODE_DATA_URL}...") @@ -15,6 +18,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) message(STATUS "Downloading UCD SpecialCasing.txt from ${SPECIAL_CASING_URL}...") file(DOWNLOAD ${SPECIAL_CASING_URL} ${SPECIAL_CASING_PATH} INACTIVITY_TIMEOUT 10) endif() + if (NOT EXISTS ${PROP_LIST_PATH}) + message(STATUS "Downloading UCD PropList.txt from ${PROP_LIST_URL}...") + file(DOWNLOAD ${PROP_LIST_URL} ${PROP_LIST_PATH} INACTIVITY_TIMEOUT 10) + endif() set(UNICODE_GENERATOR CodeGenerators/GenerateUnicodeData) set(UNICODE_DATA_HEADER UnicodeData.h) @@ -32,7 +39,7 @@ if 
(ENABLE_UNICODE_DATABASE_DOWNLOAD) add_custom_command( OUTPUT ${UNICODE_DATA_HEADER} - COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} ${UNICODE_GENERATOR} -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} + COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} ${UNICODE_GENERATOR} -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} VERBATIM DEPENDS GenerateUnicodeData MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} @@ -40,7 +47,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) add_custom_command( OUTPUT ${UNICODE_DATA_IMPLEMENTATION} - COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} ${UNICODE_GENERATOR} -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} + COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} ${UNICODE_GENERATOR} -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} VERBATIM DEPENDS GenerateUnicodeData MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} |