summary | refs | log | tree | commit | diff
path: root/Userland/Libraries/LibUnicode
diff options
context:
space:
mode:
author: Idan Horowitz <idan.horowitz@gmail.com> 2022-01-31 18:20:52 +0200
committer: Idan Horowitz <idan.horowitz@gmail.com> 2022-01-31 21:05:04 +0200
commit: 58b0eed6a7a9b9396cad1a5a3096d34a10a66795 (patch)
tree: 47b86958d5052ce08211652d5ed6aef94e5f65f4 /Userland/Libraries/LibUnicode
parent: 44e8c05c675ea18b4583246ee0c572342cc19062 (diff)
download: serenity-58b0eed6a7a9b9396cad1a5a3096d34a10a66795.zip
LibUnicode: Implement grapheme segmentation
Diffstat (limited to 'Userland/Libraries/LibUnicode')
-rw-r--r-- Userland/Libraries/LibUnicode/CharacterTypes.cpp 93
-rw-r--r-- Userland/Libraries/LibUnicode/CharacterTypes.h 2
2 files changed, 95 insertions, 0 deletions
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index ace627530e..c16b188e88 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -8,6 +8,7 @@
#include <AK/Platform.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
+#include <AK/Utf16View.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/Locale.h>
@@ -357,4 +358,96 @@ bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeB
// Weak fallback definitions: each returns a value-initialized bool (false) for every input.
// NOTE(review): presumably a strong definition from generated Unicode data replaces these at link time — confirm.
bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
+// Computes the grapheme cluster boundaries of `view` according to the rules of UAX #29:
+// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
+//
+// Boundaries are returned in ascending order as code unit offsets into `view`. For a non-empty
+// view the first entry is 0 (GB1) and the last is view.length_in_code_units() (GB2). An empty view
+// yields an empty vector, as does every view when ENABLE_UNICODE_DATA is off.
+Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
+{
+#if ENABLE_UNICODE_DATA
+    using GBP = GraphemeBreakProperty;
+    Vector<size_t> boundaries;
+
+    if (view.length_in_code_points() == 0)
+        return boundaries;
+
+    // True if the code point has at least one of the listed Grapheme_Cluster_Break values.
+    auto has_any_gbp = [](u32 code_point, auto&&... properties) {
+        return (code_point_has_grapheme_break_property(code_point, properties) || ...);
+    };
+
+    // GB1
+    boundaries.append(0);
+
+    if (view.length_in_code_points() > 1) {
+        auto it = view.begin();
+        auto code_point = *it;
+        u32 next_code_point;
+
+        // Number of consecutive Regional_Indicator code points ending at (and including) code_point.
+        auto current_ri_chain = 0;
+        // Whether the text ending at code_point matches "Extended_Pictographic Extend*".
+        auto in_emoji_sequence = false;
+
+        // Each iteration decides whether there is a boundary between code_point and next_code_point;
+        // `it` points at next_code_point, so its code unit offset is the candidate boundary.
+        for (++it; it != view.end(); ++it, code_point = next_code_point) {
+            next_code_point = *it;
+
+            // The GB11 and GB12/GB13 state must be advanced for every code point, so do it before
+            // any rule gets a chance to leave the iteration early. Otherwise a stale emoji flag or
+            // regional indicator count leaks across e.g. a Control or Prepend code point.
+            auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ);
+            // Text ending at code_point matches "Extended_Pictographic Extend* ZWJ".
+            auto zwj_follows_emoji = in_emoji_sequence && code_point_is_zwj;
+            // A new pictograph always (re)starts a sequence; only Extend may prolong one.
+            in_emoji_sequence = code_point_has_property(code_point, Property::Extended_Pictographic)
+                || (in_emoji_sequence && has_any_gbp(code_point, GBP::Extend));
+
+            auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
+            current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
+
+            auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
+            auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
+
+            // GB3
+            if (code_point_is_cr && next_code_point_is_lf)
+                continue;
+            // GB4, GB5
+            if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
+                boundaries.append(view.code_unit_offset_of(it));
+                continue;
+            }
+
+            auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
+            auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
+
+            // GB6
+            if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
+                continue;
+            // GB7
+            if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
+                continue;
+            // GB8
+            if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
+                continue;
+
+            // GB9
+            if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
+                continue;
+            // GB9a
+            if (has_any_gbp(next_code_point, GBP::SpacingMark))
+                continue;
+            // GB9b
+            if (has_any_gbp(code_point, GBP::Prepend))
+                continue;
+
+            // GB11
+            if (zwj_follows_emoji && code_point_has_property(next_code_point, Property::Extended_Pictographic))
+                continue;
+
+            // GB12, GB13: only an odd number of preceding regional indicators pairs up with the next one.
+            if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
+                continue;
+
+            // GB999
+            boundaries.append(view.code_unit_offset_of(it));
+        }
+    }
+
+    // GB2
+    boundaries.append(view.length_in_code_units());
+    return boundaries;
+#else
+    return {};
+#endif
+}
+
}
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h
index def59cd149..5a06697781 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -44,4 +44,6 @@ bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakPropert
bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property);
bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
+// Returns the grapheme cluster boundaries (UAX #29) of the given view as ascending code unit offsets.
+// For a non-empty view the result starts with 0 and ends with the view's length in code units;
+// the result is empty for an empty view or when built without ENABLE_UNICODE_DATA.
+Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
+
}