diff options
author | Timothy Flynn <trflynn89@pm.me> | 2023-02-25 15:51:43 -0500 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2023-02-25 22:23:39 +0100 |
commit | 73239fdd82aac798ac8789c70348a42a611b2759 (patch) | |
tree | 39060664b9e04a08af1de75f5fd85c8a1c0ce232 | |
parent | fa96811a220609f951a705ccc84e4458f0c7cf28 (diff) | |
download | serenity-73239fdd82aac798ac8789c70348a42a611b2759.zip |
LibUnicode: Add a unit test for Unicode grapheme and word segmentation
These include tests for previously broken boundary conditions.
-rw-r--r-- | Tests/LibUnicode/CMakeLists.txt | 1 | ||||
-rw-r--r-- | Tests/LibUnicode/TestSegmentation.cpp | 101 |
2 files changed, 102 insertions, 0 deletions
```diff
diff --git a/Tests/LibUnicode/CMakeLists.txt b/Tests/LibUnicode/CMakeLists.txt
index 0b43421ce2..06e904c21a 100644
--- a/Tests/LibUnicode/CMakeLists.txt
+++ b/Tests/LibUnicode/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TEST_SOURCES
     TestEmoji.cpp
+    TestSegmentation.cpp
     TestUnicodeCharacterTypes.cpp
     TestUnicodeNormalization.cpp
 )
diff --git a/Tests/LibUnicode/TestSegmentation.cpp b/Tests/LibUnicode/TestSegmentation.cpp
new file mode 100644
index 0000000000..1159ae875b
--- /dev/null
+++ b/Tests/LibUnicode/TestSegmentation.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Array.h>
+#include <AK/StringView.h>
+#include <AK/Utf8View.h>
+#include <AK/Vector.h>
+#include <LibTest/TestCase.h>
+#include <LibUnicode/Segmentation.h>
+
+template<size_t N>
+static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N])
+{
+    Vector<size_t> boundaries;
+    Utf8View view { string };
+
+    Unicode::for_each_grapheme_segmentation_boundary(view, [&](auto boundary) {
+        boundaries.append(boundary);
+        return IterationDecision::Continue;
+    });
+
+    EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
+}
+
+TEST_CASE(grapheme_segmentation)
+{
+    Unicode::for_each_grapheme_segmentation_boundary(Utf8View {}, [&](auto) {
+        VERIFY_NOT_REACHED();
+        return IterationDecision::Break;
+    });
+
+    test_grapheme_segmentation("a"sv, { 0u, 1u });
+    test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u });
+    test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u });
+
+    test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u });
+    test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u });
+    test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u });
+
+    test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u });
+    test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u });
+    test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u });
+    test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u });
+    test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u });
+
+    test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });
+    test_grapheme_segmentation("a👨‍👩‍👧‍👦b"sv, { 0u, 1u, 26u, 27u });
+    test_grapheme_segmentation("a👩🏼‍❤️‍👨🏻b"sv, { 0u, 1u, 29u, 30u });
+}
+
+template<size_t N>
+static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
+{
+    Vector<size_t> boundaries;
+    Utf8View view { string };
+
+    Unicode::for_each_word_segmentation_boundary(view, [&](auto boundary) {
+        boundaries.append(boundary);
+        return IterationDecision::Continue;
+    });
+
+    EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
+}
+
+TEST_CASE(word_segmentation)
+{
+    Unicode::for_each_word_segmentation_boundary(Utf8View {}, [&](auto) {
+        VERIFY_NOT_REACHED();
+        return IterationDecision::Break;
+    });
+
+    test_word_segmentation("a"sv, { 0u, 1u });
+    test_word_segmentation("ab"sv, { 0u, 2u });
+    test_word_segmentation("abc"sv, { 0u, 3u });
+
+    test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u });
+    test_word_segmentation("ab  cd"sv, { 0u, 2u, 4u, 6u });
+    test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u });
+    test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u });
+    test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u });
+    test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u });
+
+    test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });
+    test_word_segmentation("a👨‍👩‍👧‍👦b"sv, { 0u, 1u, 26u, 27u });
+    test_word_segmentation("a👩🏼‍❤️‍👨🏻b"sv, { 0u, 1u, 29u, 30u });
+
+    test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u });
+    test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u });
+    test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
+    test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u });
+
+    test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
+    test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u });
+
+    test_word_segmentation(
+        "The quick (“brown”) fox can’t jump 32.3 feet, right?"sv,
+        { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u });
+}
```