summaryrefslogtreecommitdiff
path: root/Userland/Libraries/LibUnicode
diff options
context:
space:
mode:
author: Idan Horowitz <idan.horowitz@gmail.com> 2022-01-31 18:22:24 +0200
committer: Idan Horowitz <idan.horowitz@gmail.com> 2022-01-31 21:05:04 +0200
commit: 4967bcd4ce32f35144ee2b9b5bfe6980981e553a (patch)
tree: 66a329f5d49355ae459daf4573180c5b9a5ff5b3 /Userland/Libraries/LibUnicode
parent: a593a5c8ab1f5cdf333e235fb3e32ba78b4c71ed (diff)
download: serenity-4967bcd4ce32f35144ee2b9b5bfe6980981e553a.zip
LibUnicode: Implement sentence segmentation
Diffstat (limited to 'Userland/Libraries/LibUnicode')
-rw-r--r-- Userland/Libraries/LibUnicode/CharacterTypes.cpp | 116
-rw-r--r-- Userland/Libraries/LibUnicode/CharacterTypes.h | 1
2 files changed, 117 insertions, 0 deletions
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index aea06ce489..26b3acc65b 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -602,4 +602,120 @@ Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View cons
#endif
}
+// Finds the sentence boundaries of `view` by walking it one code point at a time and applying the
+// UAX #29 sentence boundary rules (SB1 - SB998) to each pair of adjacent code points.
+//
+// Returns the boundaries as code unit offsets into `view`, in the order they are encountered. A
+// non-empty view always yields offset 0 first (SB1) and `view.length_in_code_units()` last (SB2);
+// an empty view yields no boundaries at all. When ENABLE_UNICODE_DATA is off, the result is empty.
+Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
+{
+#if ENABLE_UNICODE_DATA
+    using SBP = SentenceBreakProperty;
+    Vector<size_t> boundaries;
+
+    // https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules
+    if (view.length_in_code_points() == 0)
+        return boundaries;
+
+    // True if the code point has at least one of the given sentence break properties.
+    auto has_any_sbp = [](u32 code_point, auto&&... properties) {
+        return (code_point_has_sentence_break_property(code_point, properties) || ...);
+    };
+
+    // SB1: sot ÷ Any
+    boundaries.append(0);
+
+    if (view.length_in_code_points() > 1) {
+        // Each iteration decides whether there is a boundary between `code_point` and `next_code_point`;
+        // `it` points at `next_code_point`, so its offset is the offset of the candidate boundary.
+        auto it = view.begin();
+        auto code_point = *it;
+        u32 next_code_point;
+        Optional<u32> previous_code_point;
+
+        // Tracks how far into a "SATerm Close* Sp*" sequence `code_point` is. The enumerators are ordered
+        // so that ">= Term" means "inside such a sequence" and "<= Close" means "no Sp seen yet".
+        enum class TerminatorSequenceState {
+            None,
+            Term,
+            Close,
+            Sp
+        } terminator_sequence_state { TerminatorSequenceState::None };
+        // Whether the terminator that started the current sequence was an ATerm (as opposed to an STerm).
+        auto term_was_a_term = false;
+
+        // Note: `continue` still runs the increment expression, which advances the sliding window.
+        for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
+            next_code_point = *it;
+
+            auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
+            auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
+
+            // SB3: CR × LF
+            if (code_point_is_cr && next_code_point_is_lf)
+                continue;
+
+            auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
+
+            // SB4: ParaSep ÷
+            if (code_point_is_para_sep) {
+                boundaries.append(view.code_unit_offset_of(it));
+                continue;
+            }
+
+            // SB5: X (Extend | Format)* → X
+            if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
+                continue;
+
+            auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
+
+            // SB6: ATerm × Numeric
+            if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
+                continue;
+            // SB7: (Upper | Lower) ATerm × Upper
+            if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
+                continue;
+
+            // Advance the "SATerm Close* Sp*" state machine with `code_point`.
+            if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
+                terminator_sequence_state = TerminatorSequenceState::Term;
+                term_was_a_term = code_point_is_a_term;
+            } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
+                terminator_sequence_state = TerminatorSequenceState::Close;
+            } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
+                terminator_sequence_state = TerminatorSequenceState::Sp;
+            } else {
+                terminator_sequence_state = TerminatorSequenceState::None;
+            }
+
+            // SB8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
+            if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
+                bool illegal_sequence = false;
+                // Look ahead starting at `next_code_point`: step over every code point that may sit between
+                // the terminator sequence and a lowercase letter, and let the first code point that may not
+                // decide the outcome. The code point must be re-read on every step, and the scan must stop
+                // at that first deciding code point.
+                for (auto it_copy = it; it_copy != view.end(); ++it_copy) {
+                    auto sequence_code_point = *it_copy;
+                    if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
+                        continue;
+                    illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
+                    break;
+                }
+                if (illegal_sequence)
+                    continue;
+            }
+
+            // SB8a: SATerm Close* Sp* × (SContinue | SATerm)
+            if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
+                continue;
+
+            auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
+            auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
+
+            // SB9: SATerm Close* × (Close | Sp | ParaSep)
+            if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
+                continue;
+
+            // SB10: SATerm Close* Sp* × (Sp | ParaSep)
+            if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
+                continue;
+
+            // SB11: SATerm Close* Sp* ParaSep? ÷
+            if (terminator_sequence_state >= TerminatorSequenceState::Term)
+                boundaries.append(view.code_unit_offset_of(it));
+
+            // SB998: Any × Any (nothing to do: no boundary is recorded)
+        }
+    }
+
+    // SB2: Any ÷ eot
+    boundaries.append(view.length_in_code_units());
+    return boundaries;
+#else
+    return {};
+#endif
+}
+
}
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h
index ce53957fc6..a8acca871c 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -46,5 +46,6 @@ bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakPropert
Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
Vector<size_t> find_word_segmentation_boundaries(Utf16View const&);
+Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const&);
}