summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Timothy Flynn <trflynn89@pm.me> 2023-02-25 15:51:43 -0500
committer: Linus Groh <mail@linusgroh.de> 2023-02-25 22:23:39 +0100
commit: 73239fdd82aac798ac8789c70348a42a611b2759 (patch)
tree: 39060664b9e04a08af1de75f5fd85c8a1c0ce232
parent: fa96811a220609f951a705ccc84e4458f0c7cf28 (diff)
download: serenity-73239fdd82aac798ac8789c70348a42a611b2759.zip
LibUnicode: Add a unit test for Unicode grapheme and word segmentation
These include tests for previously broken boundary conditions.
-rw-r--r-- Tests/LibUnicode/CMakeLists.txt 1
-rw-r--r-- Tests/LibUnicode/TestSegmentation.cpp 101
2 files changed, 102 insertions, 0 deletions
diff --git a/Tests/LibUnicode/CMakeLists.txt b/Tests/LibUnicode/CMakeLists.txt
index 0b43421ce2..06e904c21a 100644
--- a/Tests/LibUnicode/CMakeLists.txt
+++ b/Tests/LibUnicode/CMakeLists.txt
@@ -1,5 +1,6 @@
set(TEST_SOURCES
TestEmoji.cpp
+ TestSegmentation.cpp
TestUnicodeCharacterTypes.cpp
TestUnicodeNormalization.cpp
)
diff --git a/Tests/LibUnicode/TestSegmentation.cpp b/Tests/LibUnicode/TestSegmentation.cpp
new file mode 100644
index 0000000000..1159ae875b
--- /dev/null
+++ b/Tests/LibUnicode/TestSegmentation.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Array.h>
+#include <AK/StringView.h>
+#include <AK/Utf8View.h>
+#include <AK/Vector.h>
+#include <LibTest/TestCase.h>
+#include <LibUnicode/Segmentation.h>
+
+// Test helper: walks `string` as UTF-8 with
+// Unicode::for_each_grapheme_segmentation_boundary, records every boundary
+// handed to the callback, and expects the recorded list to equal
+// `expected_boundaries`.
+template<size_t N>
+static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N])
+{
+    Vector<size_t> boundaries;
+
+    // The callback always answers Continue, so it never asks for an early stop.
+    Unicode::for_each_grapheme_segmentation_boundary(Utf8View { string }, [&boundaries](auto offset) {
+        boundaries.append(offset);
+        return IterationDecision::Continue;
+    });
+
+    EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
+}
+
+// Grapheme cluster boundaries for ASCII text, line endings, Hangul and emoji.
+// The expected values are byte offsets into the UTF-8 input: a 3-byte code
+// point between 'a' and 'b' yields { 0, 1, 4, 5 }.
+//
+// Non-ASCII input is spelled with universal-character-names so the byte length
+// of every literal stays unambiguous even if this file is transcoded.
+TEST_CASE(grapheme_segmentation)
+{
+    // The callback is expected never to run for an empty view.
+    Unicode::for_each_grapheme_segmentation_boundary(Utf8View {}, [&](auto) {
+        VERIFY_NOT_REACHED();
+        return IterationDecision::Break;
+    });
+
+    // Plain ASCII: a boundary around every character.
+    test_grapheme_segmentation("a"sv, { 0u, 1u });
+    test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u });
+    test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u });
+
+    // Line endings: "\n\r" is split in two, "\r\n" stays together (1..3).
+    test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u });
+    test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u });
+    test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u });
+
+    // Hangul, 3 UTF-8 bytes per code point:
+    //   U+1100 leading consonant jamo, U+1192 vowel jamo,
+    //   U+AC00 and U+AC01 precomposed syllables.
+    // A leading jamo followed by another jamo or a syllable forms one cluster.
+    test_grapheme_segmentation("a\u1100b"sv, { 0u, 1u, 4u, 5u });
+    test_grapheme_segmentation("a\u1100\u1100b"sv, { 0u, 1u, 7u, 8u });
+    test_grapheme_segmentation("a\u1100\u1192b"sv, { 0u, 1u, 7u, 8u });
+    test_grapheme_segmentation("a\u1100\uAC00b"sv, { 0u, 1u, 7u, 8u });
+    test_grapheme_segmentation("a\u1100\uAC01b"sv, { 0u, 1u, 7u, 8u });
+
+    // Emoji: a single 4-byte emoji (U+1F600), then two sequences joined with
+    // U+200D ZERO WIDTH JOINER that must each stay one cluster:
+    //   family: U+1F468 ZWJ U+1F469 ZWJ U+1F467 ZWJ U+1F466 (25 bytes)
+    //   couple: U+1F469 U+1F3FC ZWJ U+2764 U+FE0F ZWJ U+1F468 U+1F3FB (28 bytes)
+    test_grapheme_segmentation("a\U0001F600b"sv, { 0u, 1u, 5u, 6u });
+    test_grapheme_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });
+    test_grapheme_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });
+}
+
+// Test helper: walks `string` as UTF-8 with
+// Unicode::for_each_word_segmentation_boundary, records every boundary handed
+// to the callback, and expects the recorded list to equal
+// `expected_boundaries`.
+template<size_t N>
+static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
+{
+    Vector<size_t> boundaries;
+
+    // The callback always answers Continue, so it never asks for an early stop.
+    Unicode::for_each_word_segmentation_boundary(Utf8View { string }, [&boundaries](auto offset) {
+        boundaries.append(offset);
+        return IterationDecision::Continue;
+    });
+
+    EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
+}
+
+// Word boundaries for ASCII words, whitespace, line endings, emoji, numbers,
+// dotted names, apostrophes and quotation marks. The expected values are byte
+// offsets into the UTF-8 input.
+//
+// Non-ASCII input is spelled with universal-character-names so the byte length
+// of every literal stays unambiguous even if this file is transcoded.
+TEST_CASE(word_segmentation)
+{
+    // The callback is expected never to run for an empty view.
+    Unicode::for_each_word_segmentation_boundary(Utf8View {}, [&](auto) {
+        VERIFY_NOT_REACHED();
+        return IterationDecision::Break;
+    });
+
+    // A run of letters is one word.
+    test_word_segmentation("a"sv, { 0u, 1u });
+    test_word_segmentation("ab"sv, { 0u, 2u });
+    test_word_segmentation("abc"sv, { 0u, 3u });
+
+    // Separators between two words. The second input contains TWO spaces, which
+    // are expected to form a single segment (2..4). "\r\n" is likewise one
+    // segment, while "\n\r" is split.
+    test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u });
+    test_word_segmentation("ab  cd"sv, { 0u, 2u, 4u, 6u });
+    test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u });
+    test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u });
+    test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u });
+    test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u });
+
+    // Emoji: a single 4-byte emoji (U+1F600), then two sequences joined with
+    // U+200D ZERO WIDTH JOINER that must each stay one segment:
+    //   family: U+1F468 ZWJ U+1F469 ZWJ U+1F467 ZWJ U+1F466 (25 bytes)
+    //   couple: U+1F469 U+1F3FC ZWJ U+2764 U+FE0F ZWJ U+1F468 U+1F3FB (28 bytes)
+    test_word_segmentation("a\U0001F600b"sv, { 0u, 1u, 5u, 6u });
+    test_word_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });
+    test_word_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });
+
+    // Digits, decimal numbers and dotted names are not split at the '.'.
+    test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u });
+    test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u });
+    test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
+    test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u });
+
+    // An apostrophe inside a word does not split it; surrounding double quotes
+    // are segments of their own.
+    test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
+    test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u });
+
+    // Full sentence with typographic punctuation, 3 UTF-8 bytes each:
+    // U+201C / U+201D curly double quotes and U+2019 curly apostrophe.
+    test_word_segmentation(
+        "The quick (\u201Cbrown\u201D) fox can\u2019t jump 32.3 feet, right?"sv,
+        { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u });
+}