diff options
author | Timothy Flynn <trflynn89@pm.me> | 2023-02-25 10:36:21 -0500 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2023-02-25 22:23:39 +0100 |
commit | fa96811a220609f951a705ccc84e4458f0c7cf28 (patch) | |
tree | 85e040b03805a3f3416699b4fc1aa19587f34e0d /Userland/Libraries/LibUnicode | |
parent | 09d40bfbb24588b5659f17e2701b5c367a447110 (diff) | |
download | serenity-fa96811a220609f951a705ccc84e4458f0c7cf28.zip |
LibUnicode: Skip over emoji sequences in grapheme boundary segmentation
Emoji sequences in the grapheme segmentation spec are a bit tricky:
\p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
Our current strategy of tracking a boolean to indicate if we are in an
emoji sequence was causing us to break up emoji made of multiple sub-
sequences. For example, in the "family: man, woman, girl, boy" sequence:
U+1F468 U+200D U+1F469 U+200D U+1F467 U+200D U+1F466
We would break at indices 0 (correctly) and 6 (incorrectly; counted in UTF-16 code units, index 6 is the position just before U+1F467).
Instead of tracking a boolean, it's quite a bit simpler to reason about
emoji sequences by just skipping past them entirely. Note that in cases
like the above emoji, we skip one sub-sequence at a time.
Diffstat (limited to 'Userland/Libraries/LibUnicode')
-rw-r--r-- | Userland/Libraries/LibUnicode/Segmentation.cpp | 31 |
1 file changed, 19 insertions, 12 deletions
diff --git a/Userland/Libraries/LibUnicode/Segmentation.cpp b/Userland/Libraries/LibUnicode/Segmentation.cpp index 47715efe84..750f52907a 100644 --- a/Userland/Libraries/LibUnicode/Segmentation.cpp +++ b/Userland/Libraries/LibUnicode/Segmentation.cpp @@ -64,13 +64,30 @@ static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewTy if (code_unit_length(view) > 1) { auto it = view.begin(); auto code_point = *it; - u32 next_code_point; + u32 next_code_point = 0; auto current_ri_chain = 0; - auto in_emoji_sequence = false; for (++it; it != view.end(); ++it, code_point = next_code_point) { next_code_point = *it; + // GB11 + if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) { + auto it_copy = it; + + while (it_copy != view.end() && has_any_gbp(*it_copy, GBP::Extend)) + ++it_copy; + + if (it_copy != view.end() && has_any_gbp(*it_copy, GBP::ZWJ)) { + ++it_copy; + + if (it_copy != view.end() && code_point_has_property(*it_copy, Property::Extended_Pictographic)) { + next_code_point = *it_copy; + it = it_copy; + continue; + } + } + } + auto code_point_is_cr = has_any_gbp(code_point, GBP::CR); auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF); @@ -97,12 +114,6 @@ static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewTy if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T)) continue; - auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ); - if (!in_emoji_sequence && code_point_has_property(code_point, Property::Extended_Pictographic)) - in_emoji_sequence = true; - else if (in_emoji_sequence && !has_any_gbp(code_point, GBP::Extend) && !code_point_is_zwj) - in_emoji_sequence = false; - // GB9 if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) continue; @@ -113,10 +124,6 @@ static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewTy if (has_any_gbp(code_point, GBP::Prepend)) continue; - // GB11 - 
if (in_emoji_sequence && code_point_is_zwj && code_point_has_property(next_code_point, Property::Extended_Pictographic)) - continue; - auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator); current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0; |