summary | refs | log | tree | commit | diff
path: root/Userland/Libraries/LibUnicode
diff options
context:
space:
mode:
author Timothy Flynn <trflynn89@pm.me> 2021-07-28 21:45:09 -0400
committer Linus Groh <mail@linusgroh.de> 2021-07-30 21:26:31 +0100
commit f1809db9946200bcbe86df8c3d6ebf87b44ec0a1 (patch)
tree bde429aaa4f69b5836f688e0bef032ec72e5d9de /Userland/Libraries/LibUnicode
parent 3f80791ed535e0752109cb8900b56e7244e01669 (diff)
download serenity-f1809db9946200bcbe86df8c3d6ebf87b44ec0a1.zip
LibUnicode: Add public methods to compare and lookup Unicode properties
Adds methods to retrieve a Unicode property from a string and to check if a code point matches a Unicode property. Also adds a <LibUnicode/Forward.h> header.
Diffstat (limited to 'Userland/Libraries/LibUnicode')
-rw-r--r-- Userland/Libraries/LibUnicode/CharacterTypes.cpp 40
-rw-r--r-- Userland/Libraries/LibUnicode/CharacterTypes.h 4
-rw-r--r-- Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp 35
-rw-r--r-- Userland/Libraries/LibUnicode/Forward.h 22
4 files changed, 92 insertions, 9 deletions
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index 0cdf901721..411da0da9f 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -4,6 +4,7 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
+#include <AK/CharacterTypes.h>
#include <AK/Platform.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
@@ -12,8 +13,6 @@
#if ENABLE_UNICODE_DATA
# include <LibUnicode/UnicodeData.h>
-#else
-# include <AK/CharacterTypes.h>
#endif
// For details on the algorithms used here, see Section 3.13 Default Case Algorithms
@@ -41,7 +40,7 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
size_t cased_letter_count = 0;
for (auto code_point : preceding_view) {
- auto unicode_data = unicode_data_for_code_point(code_point);
+ auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (!unicode_data.has_value())
return false;
@@ -58,7 +57,7 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
return false;
for (auto code_point : following_view) {
- auto unicode_data = unicode_data_for_code_point(code_point);
+ auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (!unicode_data.has_value())
return false;
@@ -107,7 +106,7 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, s
u32 to_unicode_lowercase(u32 code_point)
{
#if ENABLE_UNICODE_DATA
- auto unicode_data = unicode_data_for_code_point(code_point);
+ auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (unicode_data.has_value())
return unicode_data->simple_lowercase_mapping;
return code_point;
@@ -119,7 +118,7 @@ u32 to_unicode_lowercase(u32 code_point)
u32 to_unicode_uppercase(u32 code_point)
{
#if ENABLE_UNICODE_DATA
- auto unicode_data = unicode_data_for_code_point(code_point);
+ auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (unicode_data.has_value())
return unicode_data->simple_uppercase_mapping;
return code_point;
@@ -139,7 +138,7 @@ String to_unicode_lowercase_full(StringView const& string)
u32 code_point = *it;
size_t byte_length = it.underlying_code_point_length_in_bytes();
- auto unicode_data = unicode_data_for_code_point(code_point);
+ auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (!unicode_data.has_value()) {
builder.append_code_point(code_point);
index += byte_length;
@@ -174,7 +173,7 @@ String to_unicode_uppercase_full(StringView const& string)
u32 code_point = *it;
size_t byte_length = it.underlying_code_point_length_in_bytes();
- auto unicode_data = unicode_data_for_code_point(code_point);
+ auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (!unicode_data.has_value()) {
builder.append_code_point(code_point);
index += byte_length;
@@ -198,4 +197,29 @@ String to_unicode_uppercase_full(StringView const& string)
#endif
}
+Optional<Property> property_from_string([[maybe_unused]] StringView const& property)
+{
+#if ENABLE_UNICODE_DATA
+ return Detail::property_from_string(property);
+#else
+ return {};
+#endif
+}
+
+bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property)
+{
+#if ENABLE_UNICODE_DATA
+ if (property == Property::Any)
+ return is_unicode(code_point);
+
+ auto unicode_data = Detail::unicode_data_for_code_point(code_point);
+ if (!unicode_data.has_value())
+ return false;
+
+ return has_property(*unicode_data, property);
+#else
+ return false;
+#endif
+}
+
}
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h
index beb2288cfb..eac6e79293 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -9,6 +9,7 @@
#include <AK/Forward.h>
#include <AK/String.h>
#include <AK/Types.h>
+#include <LibUnicode/Forward.h>
namespace Unicode {
@@ -20,4 +21,7 @@ u32 to_unicode_uppercase(u32 code_point);
String to_unicode_lowercase_full(StringView const&);
String to_unicode_uppercase_full(StringView const&);
+Optional<Property> property_from_string(StringView const&);
+bool code_point_has_property(u32 code_point, Property property);
+
}
diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
index 9744fc92e5..7132d86dd6 100644
--- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
+++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
@@ -404,6 +404,7 @@ constexpr @name@ operator|(@name@ value1, @name@ value2)
#include <AK/Optional.h>
#include <AK/Types.h>
+#include <LibUnicode/Forward.h>
namespace Unicode {
)~~~");
@@ -411,7 +412,7 @@ namespace Unicode {
generate_enum("Locale"sv, "None"sv, move(unicode_data.locales));
generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
generate_enum("GeneralCategory"sv, {}, move(unicode_data.general_categories));
- generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), move(unicode_data.prop_aliases), true);
+ generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), unicode_data.prop_aliases, true);
generate_enum("WordBreakProperty"sv, "Other"sv, unicode_data.word_break_prop_list.keys());
generator.append(R"~~~(
@@ -469,7 +470,12 @@ struct UnicodeData {
WordBreakProperty word_break_property { WordBreakProperty::Other };
};
+namespace Detail {
+
Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
+Optional<Property> property_from_string(StringView const& property);
+
+}
})~~~");
@@ -489,6 +495,7 @@ static void generate_unicode_data_implementation(UnicodeData unicode_data)
#include <AK/Array.h>
#include <AK/CharacterTypes.h>
#include <AK/Find.h>
+#include <AK/StringView.h>
#include <LibUnicode/UnicodeData.h>
namespace Unicode {
@@ -597,6 +604,8 @@ static Optional<u32> index_of_code_point_in_range(u32 code_point)
return {};
}
+namespace Detail {
+
Optional<UnicodeData> unicode_data_for_code_point(u32 code_point)
{
VERIFY(is_unicode(code_point));
@@ -618,6 +627,30 @@ Optional<UnicodeData> unicode_data_for_code_point(u32 code_point)
return {};
}
+Optional<Property> property_from_string(StringView const& property)
+{
+ if (property == "Assigned"sv)
+ return Property::Assigned;)~~~");
+
+ for (auto const& property : unicode_data.prop_list) {
+ generator.set("property", property.key);
+ generator.append(R"~~~(
+ if (property == "@property@"sv)
+ return Property::@property@;)~~~");
+ }
+ for (auto const& alias : unicode_data.prop_aliases) {
+ generator.set("property", alias.alias);
+ generator.append(R"~~~(
+ if (property == "@property@"sv)
+ return Property::@property@;)~~~");
+ }
+
+ generator.append(R"~~~(
+ return {};
+}
+
+}
+
})~~~");
outln("{}", generator.as_string_view());
diff --git a/Userland/Libraries/LibUnicode/Forward.h b/Userland/Libraries/LibUnicode/Forward.h
new file mode 100644
index 0000000000..0c6aa4cb08
--- /dev/null
+++ b/Userland/Libraries/LibUnicode/Forward.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Types.h>
+
+namespace Unicode {
+
+enum class Condition;
+enum class GeneralCategory;
+enum class Locale;
+enum class Property : u64;
+enum class WordBreakProperty;
+
+struct SpecialCasing;
+struct UnicodeData;
+
+}