diff options
author | Timothy Flynn <trflynn89@pm.me> | 2022-08-17 09:52:26 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2022-08-17 15:42:12 +0100 |
commit | ca92e37ae0b851ccda2c8c52a379fa5ca0c376b8 (patch) | |
tree | 5a4ccbfbf3553541b003f047c8133f4aeb9cb01d /Meta | |
parent | 2c2ede8581490b04b7598289aab08e5c837b194e (diff) | |
download | serenity-ca92e37ae0b851ccda2c8c52a379fa5ca0c376b8.zip |
LibUnicode: Generate code point display names with run-length encoding
Similar to commit becec35, our code point display name data was a large
list of StringViews. RLE can be used here as well to move about 32 MB
from the initialized data section to the read-only section.
Some of the refactoring to store strings as indices into an RLE array
also lets us clean up some of the code point name generators.
Diffstat (limited to 'Meta')
-rw-r--r-- | Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp | 195 |
1 file changed, 114 insertions, 81 deletions
diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 29067bb499..4330628d86 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -20,6 +20,9 @@ #include <LibCore/ArgsParser.h> #include <LibCore/Stream.h> +using StringIndexType = u16; +constexpr auto s_string_index_type = "u16"sv; + // Some code points are excluded from UnicodeData.txt, and instead are part of a "range" of code // points, as indicated by the "name" field. For example: // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; @@ -63,7 +66,7 @@ using NormalizationProps = HashMap<String, Vector<Normalization>>; struct CodePointName { CodePointRange code_point_range; - StringView name; + StringIndexType name { 0 }; }; // UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt @@ -72,7 +75,7 @@ struct CodePointName { struct CodePointData { u32 code_point { 0 }; String name; - Optional<StringView> abbreviation; + Optional<StringIndexType> abbreviation; u8 canonical_combining_class { 0 }; String bidi_class; String decomposition_type; @@ -90,10 +93,12 @@ struct CodePointData { struct BlockName { CodePointRange code_point_range; - String name; + StringIndexType name { 0 }; }; struct UnicodeData { + UniqueStringStorage<StringIndexType> unique_strings; + u32 code_points_with_non_zero_combining_class { 0 }; u32 simple_uppercase_mapping_size { 0 }; @@ -107,8 +112,8 @@ struct UnicodeData { Vector<CodePointData> code_point_data; - HashMap<u32, String> code_point_abbreviations; - HashMap<u32, String> code_point_display_name_aliases; + HashMap<u32, StringIndexType> code_point_abbreviations; + HashMap<u32, StringIndexType> code_point_display_name_aliases; Vector<CodePointName> code_point_display_names; PropList general_categories; @@ -355,10 +360,13 @@ static ErrorOr<void> 
parse_name_aliases(Core::Stream::BufferedFile& file, Unicod auto reason = segments[2].trim_whitespace(); if (reason == "abbreviation"sv) { - unicode_data.code_point_abbreviations.set(*code_point, alias); + auto index = unicode_data.unique_strings.ensure(alias); + unicode_data.code_point_abbreviations.set(*code_point, index); } else if (reason.is_one_of("correction"sv, "control"sv)) { - if (!unicode_data.code_point_display_name_aliases.contains(*code_point)) - unicode_data.code_point_display_name_aliases.set(*code_point, alias); + if (!unicode_data.code_point_display_name_aliases.contains(*code_point)) { + auto index = unicode_data.unique_strings.ensure(alias); + unicode_data.code_point_display_name_aliases.set(*code_point, index); + } } } @@ -459,8 +467,13 @@ static void add_canonical_code_point_name(CodePointRange range, StringView name, // https://www.unicode.org/versions/Unicode14.0.0/ch04.pdf#G142981 // FIXME: Implement the NR1 rules for Hangul syllables. + struct CodePointNameFormat { + CodePointRange code_point_range; + StringView name; + }; + // These code point ranges are the NR2 set of name replacements defined by Table 4-8. 
- constexpr Array<CodePointName, 15> s_ideographic_replacements { { + constexpr Array<CodePointNameFormat, 15> s_ideographic_replacements { { { { 0x3400, 0x4DBF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv }, { { 0x4E00, 0x9FFC }, "CJK UNIFIED IDEOGRAPH-{:X}"sv }, { { 0xF900, 0xFA6D }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv }, @@ -484,7 +497,8 @@ static void add_canonical_code_point_name(CodePointRange range, StringView name, }); if (it != s_ideographic_replacements.end()) { - unicode_data.code_point_display_names.append(*it); + auto index = unicode_data.unique_strings.ensure(it->name); + unicode_data.code_point_display_names.append({ it->code_point_range, index }); return; } @@ -505,7 +519,8 @@ static void add_canonical_code_point_name(CodePointRange range, StringView name, return; } - unicode_data.code_point_display_names.append({ range, name }); + auto index = unicode_data.unique_strings.ensure(name); + unicode_data.code_point_display_names.append({ range, index }); } static ErrorOr<void> parse_block_display_names(Core::Stream::BufferedFile& file, UnicodeData& unicode_data) @@ -521,7 +536,9 @@ static ErrorOr<void> parse_block_display_names(Core::Stream::BufferedFile& file, auto code_point_range = parse_code_point_range(segments[0].trim_whitespace()); auto display_name = segments[1].trim_whitespace(); - unicode_data.block_display_names.append({ code_point_range, display_name }); + + auto index = unicode_data.unique_strings.ensure(display_name); + unicode_data.block_display_names.append({ code_point_range, index }); } TRY(file.seek(0, Core::Stream::SeekMode::SetPosition)); @@ -713,6 +730,7 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::Stream::Buffered StringBuilder builder; SourceGenerator generator { builder }; + generator.set("string_index_type"sv, s_string_index_type); generator.set("largest_special_casing_size", String::number(unicode_data.largest_special_casing_size)); generator.set("special_casing_size", 
String::number(unicode_data.special_casing.size())); @@ -730,6 +748,8 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::Stream::Buffered namespace Unicode { )~~~"); + unicode_data.unique_strings.generate(generator); + auto append_list_and_size = [&](auto const& list, StringView format) { if (list.is_empty()) { generator.append(", {}, 0"); @@ -784,7 +804,7 @@ struct SpecialCaseMapping { struct CodePointAbbreviation { u32 code_point { 0 }; - StringView abbreviation {}; + @string_index_type@ abbreviation { 0 }; }; template<typename MappingType> @@ -794,6 +814,37 @@ struct CodePointComparator { return code_point - mapping.code_point; } }; + +struct CodePointRangeComparator { + constexpr int operator()(u32 code_point, CodePointRange const& range) + { + return (code_point > range.last) - (code_point < range.first); + } +}; + +struct BlockNameData { + CodePointRange code_point_range {}; + @string_index_type@ display_name { 0 }; +}; + +struct BlockNameComparator : public CodePointRangeComparator { + constexpr int operator()(u32 code_point, BlockNameData const& name) + { + return CodePointRangeComparator::operator()(code_point, name.code_point_range); + } +}; + +struct CodePointName { + CodePointRange code_point_range {}; + @string_index_type@ display_name { 0 }; +}; + +struct CodePointNameComparator : public CodePointRangeComparator { + constexpr int operator()(u32 code_point, CodePointName const& name) + { + return CodePointRangeComparator::operator()(code_point, name.code_point_range); + } +}; )~~~"); auto append_code_point_mappings = [&](StringView name, StringView mapping_type, u32 size, auto mapping_getter) { @@ -825,12 +876,9 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { generator.set("code_point", String::formatted("{:#x}", data.code_point)); generator.append("{ @code_point@"); - if constexpr (IsSame<decltype(mapping), Optional<u32>>) { + if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), 
Optional<StringIndexType>>) { generator.set("mapping", String::formatted("{:#x}", *mapping)); generator.append(", @mapping@ },"); - } else if constexpr (IsSame<decltype(mapping), Optional<StringView>>) { - generator.set("mapping", String::formatted("{}", *mapping)); - generator.append(", \"@mapping@\"sv },"); } else { append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv); generator.append(" },"); @@ -857,16 +905,6 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; }); append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; }); - generator.append(R"~~~( -struct CodePointRangeComparator { - constexpr int operator()(u32 code_point, CodePointRange const& range) - { - return (code_point > range.last) - (code_point < range.first); - } -}; - -)~~~"); - auto append_code_point_range_list = [&](String name, Vector<CodePointRange> const& ranges) { generator.set("name", name); generator.set("size", String::number(ranges.size())); @@ -930,80 +968,73 @@ static constexpr Array<Span<CodePointRange const>, @size@> @name@ { {)~~~"); append_prop_list("s_word_break_properties"sv, "s_word_break_property_{}"sv, unicode_data.word_break_props); append_prop_list("s_sentence_break_properties"sv, "s_sentence_break_property_{}"sv, unicode_data.sentence_break_props); - generator.append(R"~~~( -struct BlockNameComparator : public CodePointRangeComparator { - constexpr int operator()(u32 code_point, BlockName const& name) - { - return CodePointRangeComparator::operator()(code_point, name.code_point_range); - } -}; -)~~~"); + auto append_code_point_display_names = [&](StringView type, StringView name, auto const& display_names) { + constexpr size_t max_values_per_row = 30; 
+ size_t values_in_current_row = 0; - generator.set("block_display_names_size", String::number(unicode_data.block_display_names.size())); - generator.append(R"~~~( -static constexpr Array<BlockName, @block_display_names_size@> s_block_display_names { { -)~~~"); - for (auto const& block_name : unicode_data.block_display_names) { - generator.set("first", String::formatted("{:#x}", block_name.code_point_range.first)); - generator.set("last", String::formatted("{:#x}", block_name.code_point_range.last)); - generator.set("name", block_name.name); - generator.append(R"~~~( { { @first@, @last@ }, "@name@"sv }, -)~~~"); - } - generator.append(R"~~~(} }; + generator.set("type", type); + generator.set("name", name); + generator.set("size", String::number(display_names.size())); + + generator.append(R"~~~( +static constexpr Array<@type@, @size@> @name@ { { + )~~~"); + for (auto const& display_name : display_names) { + if (values_in_current_row++ > 0) + generator.append(", "); + + generator.set("first", String::formatted("{:#x}", display_name.code_point_range.first)); + generator.set("last", String::formatted("{:#x}", display_name.code_point_range.last)); + generator.set("name", String::number(display_name.name)); + generator.append("{ { @first@, @last@ }, @name@ }"); + + if (values_in_current_row == max_values_per_row) { + values_in_current_row = 0; + generator.append(",\n "); + } + } + generator.append(R"~~~( +} }; )~~~"); + }; + + append_code_point_display_names("BlockNameData"sv, "s_block_display_names"sv, unicode_data.block_display_names); + append_code_point_display_names("CodePointName"sv, "s_code_point_display_names"sv, unicode_data.code_point_display_names); generator.append(R"~~~( Optional<StringView> code_point_block_display_name(u32 code_point) { if (auto const* entry = binary_search(s_block_display_names, code_point, nullptr, BlockNameComparator {})) - return entry->display_name; + return decode_string(entry->display_name); return {}; } Span<BlockName const> 
block_display_names() { - return s_block_display_names; -} -)~~~"); + static auto display_names = []() { + Array<BlockName, s_block_display_names.size()> display_names; - generator.append(R"~~~( -struct CodePointName { - CodePointRange code_point_range {}; - StringView display_name; -}; + for (size_t i = 0; i < s_block_display_names.size(); ++i) { + auto const& display_name = s_block_display_names[i]; + display_names[i] = { display_name.code_point_range, decode_string(display_name.display_name) }; + } -struct CodePointNameComparator : public CodePointRangeComparator { - constexpr int operator()(u32 code_point, CodePointName const& name) - { - return CodePointRangeComparator::operator()(code_point, name.code_point_range); - } -}; -)~~~"); + return display_names; + }(); - generator.set("code_point_display_names_size", String::number(unicode_data.code_point_display_names.size())); - generator.append(R"~~~( -static constexpr Array<CodePointName, @code_point_display_names_size@> s_code_point_display_names { { -)~~~"); - for (auto const& code_point_name : unicode_data.code_point_display_names) { - generator.set("first", String::formatted("{:#x}", code_point_name.code_point_range.first)); - generator.set("last", String::formatted("{:#x}", code_point_name.code_point_range.last)); - generator.set("name", code_point_name.name); - generator.append(R"~~~( { { @first@, @last@ }, "@name@"sv }, -)~~~"); - } - generator.append(R"~~~(} }; -)~~~"); + return display_names.span(); +} - generator.append(R"~~~( Optional<String> code_point_display_name(u32 code_point) { if (auto const* entry = binary_search(s_code_point_display_names, code_point, nullptr, CodePointNameComparator {})) { - if (entry->display_name.ends_with("{:X}"sv)) - return String::formatted(entry->display_name, code_point); + auto display_name = decode_string(entry->display_name); - return entry->display_name; + if (display_name.ends_with("{:X}"sv)) + return String::formatted(display_name, code_point); + + return 
display_name; } return {}; @@ -1042,8 +1073,10 @@ Optional<StringView> code_point_abbreviation(u32 code_point) auto const* mapping = binary_search(s_abbreviation_mappings, code_point, nullptr, CodePointComparator<CodePointAbbreviation> {}); if (mapping == nullptr) return {}; + if (mapping->abbreviation == 0) + return {}; - return mapping->abbreviation; + return decode_string(mapping->abbreviation); } )~~~"); |