diff options
author | Idan Horowitz <idan.horowitz@gmail.com> | 2022-01-31 18:20:52 +0200 |
---|---|---|
committer | Idan Horowitz <idan.horowitz@gmail.com> | 2022-01-31 21:05:04 +0200 |
commit | 58b0eed6a7a9b9396cad1a5a3096d34a10a66795 (patch) | |
tree | 47b86958d5052ce08211652d5ed6aef94e5f65f4 /Userland/Libraries/LibUnicode | |
parent | 44e8c05c675ea18b4583246ee0c572342cc19062 (diff) | |
download | serenity-58b0eed6a7a9b9396cad1a5a3096d34a10a66795.zip |
LibUnicode: Implement grapheme segmentation
Diffstat (limited to 'Userland/Libraries/LibUnicode')
-rw-r--r-- | Userland/Libraries/LibUnicode/CharacterTypes.cpp | 93 | ||||
-rw-r--r-- | Userland/Libraries/LibUnicode/CharacterTypes.h | 2 |
2 files changed, 95 insertions, 0 deletions
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index ace627530e..c16b188e88 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -8,6 +8,7 @@
 #include <AK/Platform.h>
 #include <AK/StringBuilder.h>
 #include <AK/Types.h>
+#include <AK/Utf16View.h>
 #include <AK/Utf8View.h>
 #include <LibUnicode/CharacterTypes.h>
 #include <LibUnicode/Locale.h>
@@ -357,4 +358,112 @@ bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeB
 bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
 bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
 
+// Computes the grapheme cluster boundaries of `view` according to the rules in
+// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
+//
+// Returns the boundaries as UTF-16 code unit offsets into `view`. For non-empty input the first entry
+// is 0 (GB1) and the last entry is view.length_in_code_units() (GB2). Returns an empty vector for
+// empty input, or when built without ENABLE_UNICODE_DATA.
+Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
+{
+#if ENABLE_UNICODE_DATA
+    using GBP = GraphemeBreakProperty;
+    Vector<size_t> boundaries;
+
+    if (view.length_in_code_points() == 0)
+        return boundaries;
+
+    auto has_any_gbp = [](u32 code_point, auto&&... properties) {
+        return (code_point_has_grapheme_break_property(code_point, properties) || ...);
+    };
+
+    // GB1
+    boundaries.append(0);
+
+    if (view.length_in_code_points() > 1) {
+        auto it = view.begin();
+        auto code_point = *it;
+        u32 next_code_point;
+
+        // Number of consecutive Regional_Indicator code points ending at code_point (inclusive).
+        size_t current_ri_chain = 0;
+        // True while the text ending at code_point matches "Extended_Pictographic Extend*" (GB11 prefix).
+        auto in_emoji_sequence = false;
+
+        for (++it; it != view.end(); ++it, code_point = next_code_point) {
+            next_code_point = *it;
+
+            // Update the GB11 and GB12/GB13 state for code_point before any rule runs: every rule below
+            // may `continue`, and skipping the update would leave stale state behind (e.g. an RI count
+            // carried across a Control or Prepend code point).
+            auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
+            current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
+
+            // Set only when code_point is the ZWJ that terminates "Extended_Pictographic Extend* ZWJ".
+            auto zwj_follows_emoji = false;
+            if (code_point_has_property(code_point, Property::Extended_Pictographic)) {
+                // An Extended_Pictographic always (re)starts the prefix, so chains such as
+                // ExtPict ZWJ ExtPict ZWJ ExtPict stay joined.
+                in_emoji_sequence = true;
+            } else if (in_emoji_sequence && !has_any_gbp(code_point, GBP::Extend)) {
+                zwj_follows_emoji = has_any_gbp(code_point, GBP::ZWJ);
+                in_emoji_sequence = false;
+            }
+
+            auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
+            auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
+
+            // GB3
+            if (code_point_is_cr && next_code_point_is_lf)
+                continue;
+            // GB4, GB5
+            if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
+                boundaries.append(view.code_unit_offset_of(it));
+                continue;
+            }
+
+            auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
+            auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
+
+            // GB6
+            if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
+                continue;
+            // GB7
+            if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
+                continue;
+            // GB8
+            if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
+                continue;
+
+            // GB9
+            if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
+                continue;
+            // GB9a
+            if (has_any_gbp(next_code_point, GBP::SpacingMark))
+                continue;
+            // GB9b
+            if (has_any_gbp(code_point, GBP::Prepend))
+                continue;
+
+            // GB11
+            if (zwj_follows_emoji && code_point_has_property(next_code_point, Property::Extended_Pictographic))
+                continue;
+
+            // GB12, GB13
+            if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
+                continue;
+
+            // GB999
+            boundaries.append(view.code_unit_offset_of(it));
+        }
+    }
+
+    // GB2
+    boundaries.append(view.length_in_code_units());
+    return boundaries;
+#else
+    return {};
+#endif
+}
+
 }
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h
index def59cd149..5a06697781 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -44,4 +44,6 @@ bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakPropert
 bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property);
 bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
 
+Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
+
 }