diff options
-rw-r--r-- | Meta/Lagom/CMakeLists.txt | 1 | ||||
-rw-r--r-- | Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp | 38 | ||||
-rw-r--r-- | Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h | 54 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/CMakeLists.txt | 1 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/CharacterTypes.cpp | 125 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/UnicodeSymbols.cpp | 94 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/UnicodeSymbols.h | 44 |
7 files changed, 256 insertions, 101 deletions
diff --git a/Meta/Lagom/CMakeLists.txt b/Meta/Lagom/CMakeLists.txt index 174110e07d..6fe3516a01 100644 --- a/Meta/Lagom/CMakeLists.txt +++ b/Meta/Lagom/CMakeLists.txt @@ -427,6 +427,7 @@ if (BUILD_LAGOM) SOURCES ${LIBUNICODE_SOURCES} ${UNICODE_DATA_SOURCES} ) target_compile_definitions(LagomUnicode PRIVATE ENABLE_UNICODE_DATA=$<BOOL:${ENABLE_UNICODE_DATABASE_DOWNLOAD}>) + target_link_libraries(LagomUnicode -ldl) # WASM file(GLOB LIBWASM_SOURCES CONFIGURE_DEPENDS "../../Userland/Libraries/LibWasm/*/*.cpp") diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index c34ecf4b10..53cc9fb2e1 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -574,8 +574,6 @@ enum class @name@ : @underlying@ {)~~~"); generator.append(R"~~~( #pragma once -#include <AK/Optional.h> -#include <AK/Span.h> #include <AK/Types.h> #include <LibUnicode/Forward.h> #include <LibUnicode/UnicodeLocale.h> @@ -605,28 +603,6 @@ struct SpecialCasing { Condition condition { Condition::None }; }; -namespace Detail { - -Optional<String> code_point_display_name(u32 code_point); - -u32 canonical_combining_class(u32 code_point); - -u32 simple_uppercase_mapping(u32 code_point); -u32 simple_lowercase_mapping(u32 code_point); -Span<SpecialCasing const* const> special_case_mapping(u32 code_point); - -bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); -Optional<GeneralCategory> general_category_from_string(StringView general_category); - -bool code_point_has_property(u32 code_point, Property property); -Optional<Property> property_from_string(StringView property); - -bool code_point_has_script(u32 code_point, Script script); -bool code_point_has_script_extension(u32 code_point, Script script); -Optional<Script> script_from_string(StringView script); - -} - } )~~~"); @@ -645,11 +621,13 @@ static void generate_unicode_data_implementation(Core::File& file, UnicodeData c #include <AK/Array.h> #include <AK/BinarySearch.h> #include <AK/CharacterTypes.h> +#include <AK/Optional.h> +#include <AK/Span.h> #include <AK/String.h> #include <AK/StringView.h> #include <LibUnicode/UnicodeData.h> -namespace Unicode { +namespace Unicode::Detail { )~~~"); auto append_list_and_size = [&](auto const& list, StringView format) { @@ -873,8 +851,7 @@ static constexpr Array<CodePointName, @code_point_display_names_size@> s_code_po )~~~"); generator.append(R"~~~( -namespace Detail { - +Optional<String> code_point_display_name(u32 code_point) asm("unicode_code_point_display_name"); Optional<String> code_point_display_name(u32 code_point) { if (auto const* entry = binary_search(s_code_point_display_names, code_point, nullptr, CodePointNameComparator {})) { @@ -893,6 +870,7 @@ Optional<String> code_point_display_name(u32 code_point) generator.set("mappings", mappings); generator.set("fallback", fallback); generator.append(R"~~~( +u32 @method@(u32 code_point) asm("unicode_@method@"); u32 @method@(u32 code_point) { auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator<CodePointMapping> {}); @@ -906,6 +884,7 @@ u32 @method@(u32 code_point) append_code_point_mapping_search("simple_lowercase_mapping"sv, "s_lowercase_mappings"sv, "code_point"sv); generator.append(R"~~~( +Span<SpecialCasing const* const> special_case_mapping(u32 code_point) asm("unicode_special_case_mapping"); Span<SpecialCasing const* const> special_case_mapping(u32 code_point) { auto const* mapping = binary_search(s_special_case_mappings, code_point, nullptr, CodePointComparator<SpecialCaseMapping> {}); @@ -921,6 +900,7 @@ Span<SpecialCasing const* const> special_case_mapping(u32 code_point) generator.set("enum_snake", enum_snake); generator.set("collection_name", collection_name); generator.append(R"~~~( +bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) asm("unicode_code_point_has_@enum_snake@"); bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) { auto index = static_cast<@enum_title@UnderlyingType>(@enum_snake@); @@ -941,7 +921,7 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) for (auto const& alias : aliases) hashes.set(alias.alias.hash(), alias.alias); - generate_value_from_string(generator, "{}_from_string"sv, enum_title, enum_snake, move(hashes)); + generate_value_from_string_for_dynamic_loading(generator, "{}_from_string"sv, enum_title, enum_snake, move(hashes)); }; append_prop_search("GeneralCategory"sv, "general_category"sv, "s_general_categories"sv); @@ -956,8 +936,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) generator.append(R"~~~( } - -} )~~~"); VERIFY(file.write(generator.as_string_view())); diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h index 74621fbbc2..e4b21bb814 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h @@ -345,6 +345,60 @@ Optional<@return_type@> @method_name@(StringView key) )~~~"); } +// This is a temporary duplicate of generate_value_from_string() until all generators support dynamic loading. +template<typename ValueType> +void generate_value_from_string_for_dynamic_loading(SourceGenerator& generator, StringView method_name_format, StringView value_type, StringView value_name, HashValueMap<ValueType> hashes, Optional<StringView> return_type = {}, StringView return_format = "{}"sv) +{ + ensure_from_string_types_are_generated(generator); + + generator.set("method_name", String::formatted(method_name_format, value_name)); + generator.set("value_type", value_type); + generator.set("value_name", value_name); + generator.set("return_type", return_type.has_value() ? *return_type : value_type); + generator.set("size", String::number(hashes.size())); + + generator.append(R"~~~( +Optional<@return_type@> @method_name@(StringView key) asm("unicode_@method_name@"); +Optional<@return_type@> @method_name@(StringView key) +{ + constexpr Array<HashValuePair<@value_type@>, @size@> hash_pairs { { + )~~~"); + + auto hash_keys = hashes.keys(); + quick_sort(hash_keys); + + constexpr size_t max_values_per_row = 10; + size_t values_in_current_row = 0; + + for (auto hash_key : hash_keys) { + if (values_in_current_row++ > 0) + generator.append(" "); + + if constexpr (IsIntegral<ValueType>) + generator.set("value"sv, String::number(hashes.get(hash_key).value())); + else + generator.set("value"sv, String::formatted("{}::{}", value_type, hashes.get(hash_key).value())); + + generator.set("hash"sv, String::number(hash_key)); + generator.append("{ @hash@U, @value@ },"sv); + + if (values_in_current_row == max_values_per_row) { + generator.append("\n "); + values_in_current_row = 0; + } + } + + generator.set("return_statement", String::formatted(return_format, "value->value"sv)); + generator.append(R"~~~( + } }; + + if (auto const* value = binary_search(hash_pairs, key.hash(), nullptr, HashValueComparator<@value_type@> {})) + return @return_statement@; + return {}; +} +)~~~"); +} + template<typename IdentifierFormatter> void generate_enum(SourceGenerator& generator, IdentifierFormatter&& format_identifier, StringView name, StringView default_, Vector<String>& values, Vector<Alias> aliases = {}) { diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt index 6634f4b504..9249cdc373 100644 --- a/Userland/Libraries/LibUnicode/CMakeLists.txt +++ b/Userland/Libraries/LibUnicode/CMakeLists.txt @@ -13,6 +13,7 @@ set(SOURCES DateTimeFormat.cpp Locale.cpp NumberFormat.cpp + UnicodeSymbols.cpp ) serenity_lib(LibUnicode unicode) diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 520f94fc4b..c394650e2c 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -11,6 +11,7 @@ #include <AK/Utf8View.h> #include <LibUnicode/CharacterTypes.h> #include <LibUnicode/Locale.h> +#include <LibUnicode/UnicodeSymbols.h> #if ENABLE_UNICODE_DATA # include <LibUnicode/UnicodeData.h> @@ -23,6 +24,18 @@ namespace Unicode { #if ENABLE_UNICODE_DATA +static u32 canonical_combining_class(u32 code_point) +{ + static auto const& symbols = Detail::Symbols::ensure_loaded(); + return symbols.canonical_combining_class(code_point); +} + +static Span<Unicode::SpecialCasing const* const> special_case_mapping(u32 code_point) +{ + static auto const& symbols = Detail::Symbols::ensure_loaded(); + return symbols.special_case_mapping(code_point); +} + static bool is_after_uppercase_i(Utf8View const& string, size_t index) { // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0. @@ -36,11 +49,11 @@ static bool is_after_uppercase_i(Utf8View const& string, size_t index) continue; } - u32 canonical_combining_class = Detail::canonical_combining_class(code_point); + u32 combining_class = canonical_combining_class(code_point); - if (canonical_combining_class == 0) + if (combining_class == 0) found_uppercase_i = false; - else if (canonical_combining_class == 230) + else if (combining_class == 230) found_uppercase_i = false; } @@ -60,11 +73,11 @@ static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index continue; } - u32 canonical_combining_class = Detail::canonical_combining_class(code_point); + u32 combining_class = canonical_combining_class(code_point); - if (canonical_combining_class == 0) + if (combining_class == 0) found_soft_dotted_code_point = false; - else if (canonical_combining_class == 230) + else if (combining_class == 230) found_soft_dotted_code_point = false; } @@ -119,11 +132,11 @@ static bool is_followed_by_combining_class_above(Utf8View const& string, size_t : Utf8View {}; for (auto code_point : following_view) { - u32 canonical_combining_class = Detail::canonical_combining_class(code_point); + u32 combining_class = canonical_combining_class(code_point); - if (canonical_combining_class == 0) + if (combining_class == 0) return false; - if (canonical_combining_class == 230) + if (combining_class == 230) return true; } @@ -142,11 +155,11 @@ static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t in if (code_point == 0x307) return true; - u32 canonical_combining_class = Detail::canonical_combining_class(code_point); + u32 combining_class = canonical_combining_class(code_point); - if (canonical_combining_class == 0) + if (combining_class == 0) return false; - if (canonical_combining_class == 230) + if (combining_class == 230) return false; } @@ -162,7 +175,7 @@ static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View requested_locale = *maybe_locale; } - auto special_casings = Detail::special_case_mapping(code_point); + auto special_casings = special_case_mapping(code_point); for (auto const* special_casing : special_casings) { if (special_casing->locale != Locale::None && special_casing->locale != requested_locale) @@ -206,29 +219,20 @@ static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View u32 to_unicode_lowercase(u32 code_point) { -#if ENABLE_UNICODE_DATA - return Detail::simple_lowercase_mapping(code_point); -#else - return AK::to_ascii_lowercase(code_point); -#endif + static auto const& symbols = Detail::Symbols::ensure_loaded(); + return symbols.simple_lowercase_mapping(code_point); } u32 to_unicode_uppercase(u32 code_point) { -#if ENABLE_UNICODE_DATA - return Detail::simple_uppercase_mapping(code_point); -#else - return AK::to_ascii_uppercase(code_point); -#endif + static auto const& symbols = Detail::Symbols::ensure_loaded(); + return symbols.simple_uppercase_mapping(code_point); } -Optional<String> code_point_display_name([[maybe_unused]] u32 code_point) +Optional<String> code_point_display_name(u32 code_point) { -#if ENABLE_UNICODE_DATA - return Detail::code_point_display_name(code_point); -#else - return {}; -#endif + static auto const& symbols = Detail::Symbols::ensure_loaded(); + return symbols.code_point_display_name(code_point); } String to_unicode_lowercase_full(StringView string, [[maybe_unused]] Optional<StringView> locale) @@ -289,40 +293,28 @@ String to_unicode_uppercase_full(StringView string, [[maybe_unused]] Optional<St #endif } -Optional<GeneralCategory> general_category_from_string([[maybe_unused]] StringView general_category) +Optional<GeneralCategory> general_category_from_string(StringView general_category) { -#if ENABLE_UNICODE_DATA - return Detail::general_category_from_string(general_category); -#else - return {}; -#endif + static auto const& symbols = Detail::Symbols::ensure_loaded(); + return symbols.general_category_from_string(general_category); } -bool code_point_has_general_category([[maybe_unused]] u32 code_point, [[maybe_unused]] GeneralCategory general_category) +bool code_point_has_general_category(u32 code_point, GeneralCategory general_category) { -#if ENABLE_UNICODE_DATA - return Detail::code_point_has_general_category(code_point, general_category); -#else - return {}; -#endif + static auto const& symbols = Detail::Symbols::ensure_loaded(); + return symbols.code_point_has_general_category(code_point, general_category); } -Optional<Property> property_from_string([[maybe_unused]] StringView property) +Optional<Property> property_from_string(StringView property) { -#if ENABLE_UNICODE_DATA - return Detail::property_from_string(property); -#else - return {}; -#endif + static auto const& symbols = Detail::Symbols::ensure_loaded(); + return symbols.property_from_string(property); } -bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property) +bool code_point_has_property(u32 code_point, Property property) { -#if ENABLE_UNICODE_DATA - return Detail::code_point_has_property(code_point, property); -#else - return false; -#endif + static auto const& symbols = Detail::Symbols::ensure_loaded(); + return symbols.code_point_has_property(code_point, property); } bool is_ecma262_property([[maybe_unused]] Property property) @@ -392,31 +384,22 @@ bool is_ecma262_property([[maybe_unused]] Property property) #endif } -Optional<Script> script_from_string([[maybe_unused]] StringView script) +Optional<Script> script_from_string(StringView script) { -#if ENABLE_UNICODE_DATA - return Detail::script_from_string(script); -#else - return {}; -#endif + static auto const& symbols = Detail::Symbols::ensure_loaded(); + return symbols.script_from_string(script); } -bool code_point_has_script([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script) +bool code_point_has_script(u32 code_point, Script script) { -#if ENABLE_UNICODE_DATA - return Detail::code_point_has_script(code_point, script); -#else - return false; -#endif + static auto const& symbols = Detail::Symbols::ensure_loaded(); + return symbols.code_point_has_script(code_point, script); } -bool code_point_has_script_extension([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script) +bool code_point_has_script_extension(u32 code_point, Script script) { -#if ENABLE_UNICODE_DATA - return Detail::code_point_has_script_extension(code_point, script); -#else - return false; -#endif + static auto const& symbols = Detail::Symbols::ensure_loaded(); + return symbols.code_point_has_script_extension(code_point, script); } } diff --git a/Userland/Libraries/LibUnicode/UnicodeSymbols.cpp b/Userland/Libraries/LibUnicode/UnicodeSymbols.cpp new file mode 100644 index 0000000000..c0b594e679 --- /dev/null +++ b/Userland/Libraries/LibUnicode/UnicodeSymbols.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include <LibUnicode/UnicodeSymbols.h> + +#if ENABLE_UNICODE_DATA +# if defined(__serenity__) +# include <LibDl/dlfcn.h> +# include <LibDl/dlfcn_integration.h> +# else +# include <dlfcn.h> +# endif +#else +# include <AK/Function.h> +#endif + +namespace Unicode::Detail { + +#if !ENABLE_UNICODE_DATA + +template<typename T> +struct FunctionStub; + +template<typename ReturnType, typename... ParameterTypes> +struct FunctionStub<Function<ReturnType(ParameterTypes...)>> { + static constexpr auto make_stub() + { + return [](ParameterTypes...) -> ReturnType { return {}; }; + } +}; + +#endif + +// This loader supports 3 modes: +// +// 1. When the Unicode data generators are enabled, and the target is Serenity, the symbols are +// dynamically loaded from the shared library containing them. +// +// 2. When the Unicode data generators are enabled, and the target is Lagom, the symbols are +// dynamically loaded from the main program. +// +// 3. When the Unicode data generators are disabled, the symbols are stubbed out to empty lambdas. +// This allows callers to remain agnostic as to whether the generators are enabled. +Symbols const& Symbols::ensure_loaded() +{ + static Symbols symbols {}; + + static bool initialized = false; + if (initialized) + return symbols; + +#if ENABLE_UNICODE_DATA +# if defined(__serenity__) + static void* libunicodedata = MUST(__dlopen("libunicodedata.so.serenity", RTLD_NOW)); + + auto load_symbol = [&]<typename T>(T& dest, char const* name) { + dest = reinterpret_cast<T>(MUST(__dlsym(libunicodedata, name))); + }; +# else + static void* libunicodedata = dlopen(nullptr, RTLD_NOW); + VERIFY(libunicodedata); + + auto load_symbol = [&]<typename T>(T& dest, char const* name) { + dest = reinterpret_cast<T>(dlsym(libunicodedata, name)); + VERIFY(dest); + }; +# endif +#else + auto load_symbol = []<typename T>(T& dest, char const*) { + dest = +FunctionStub<Function<RemovePointer<T>>>::make_stub(); + }; +#endif + + load_symbol(symbols.code_point_display_name, "unicode_code_point_display_name"); + load_symbol(symbols.canonical_combining_class, "unicode_canonical_combining_class"); + load_symbol(symbols.simple_uppercase_mapping, "unicode_simple_uppercase_mapping"); + load_symbol(symbols.simple_lowercase_mapping, "unicode_simple_lowercase_mapping"); + load_symbol(symbols.special_case_mapping, "unicode_special_case_mapping"); + load_symbol(symbols.general_category_from_string, "unicode_general_category_from_string"); + load_symbol(symbols.code_point_has_general_category, "unicode_code_point_has_general_category"); + load_symbol(symbols.property_from_string, "unicode_property_from_string"); + load_symbol(symbols.code_point_has_property, "unicode_code_point_has_property"); + load_symbol(symbols.script_from_string, "unicode_script_from_string"); + load_symbol(symbols.code_point_has_script, "unicode_code_point_has_script"); + load_symbol(symbols.code_point_has_script_extension, "unicode_code_point_has_script_extension"); + + initialized = true; + return symbols; +} + +} diff --git a/Userland/Libraries/LibUnicode/UnicodeSymbols.h b/Userland/Libraries/LibUnicode/UnicodeSymbols.h new file mode 100644 index 0000000000..e4babe8218 --- /dev/null +++ b/Userland/Libraries/LibUnicode/UnicodeSymbols.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include <AK/Optional.h> +#include <AK/String.h> +#include <AK/StringView.h> +#include <AK/Types.h> +#include <LibUnicode/Forward.h> + +namespace Unicode::Detail { + +struct Symbols { + static Symbols const& ensure_loaded(); + + // Loaded from UnicodeData.cpp: + + Optional<String> (*code_point_display_name)(u32) { nullptr }; + + u32 (*canonical_combining_class)(u32 code_point) { nullptr }; + + u32 (*simple_uppercase_mapping)(u32) { nullptr }; + u32 (*simple_lowercase_mapping)(u32) { nullptr }; + Span<SpecialCasing const* const> (*special_case_mapping)(u32 code_point) { nullptr }; + + Optional<GeneralCategory> (*general_category_from_string)(StringView) { nullptr }; + bool (*code_point_has_general_category)(u32, GeneralCategory) { nullptr }; + + Optional<Property> (*property_from_string)(StringView) { nullptr }; + bool (*code_point_has_property)(u32, Property) { nullptr }; + + Optional<Script> (*script_from_string)(StringView) { nullptr }; + bool (*code_point_has_script)(u32, Script) { nullptr }; + bool (*code_point_has_script_extension)(u32, Script) { nullptr }; + +private: + Symbols() = default; +}; + +} |