AK: Add `CircularBuffer::find_copy_in_seekback()`

This is useful for compressors, which quite frequently need to find a matching span of data within the seekback.
author: Tim Schumacher <timschumi@gmx.de> 2023-05-03 11:14:36 +0200
committer: Andreas Kling <kling@serenityos.org> 2023-05-17 09:08:53 +0200
commit: 221b91ff61aa4984577d76efcbaacfe1fafd61b4 (patch)
tree: 096999093758f4c5884c59f373a1599dc09f4ffb /AK/CircularBuffer.cpp
parent: d194011570cca93324137e3c697e7f95f2638fe6 (diff)
download: serenity-221b91ff61aa4984577d76efcbaacfe1fafd61b4.zip
1 files changed, 103 insertions, 0 deletions
diff --git a/AK/CircularBuffer.cpp b/AK/CircularBuffer.cpp
index 2d80aaaabb..05e6d7fbf1 100644
--- a/AK/CircularBuffer.cpp
+++ b/AK/CircularBuffer.cpp
@@ -249,4 +249,107 @@ ErrorOr<size_t> CircularBuffer::copy_from_seekback(size_t distance, size_t lengt
     return length - remaining_length;
 }
 
+ErrorOr<Vector<CircularBuffer::Match>> CircularBuffer::find_copy_in_seekback(size_t maximum_length, size_t minimum_length, Optional<Vector<size_t> const&> distance_hints) const
+{
+    VERIFY(minimum_length > 0);
+
+    // Clip the maximum length to the amount of data that we actually store.
+    if (maximum_length > m_used_space)
+        maximum_length = m_used_space;
+
+    if (maximum_length < minimum_length)
+        return Vector<Match> {};
+
+    Vector<Match> matches;
+
+    if (distance_hints.has_value()) {
+        // If we have any hints, verify and use those.
+        for (auto const& distance : distance_hints.value()) {
+            // TODO: This does not yet support looping repetitions.
+            if (distance < minimum_length)
+                continue;
+
+            auto needle_offset = (capacity() + m_reading_head) % capacity();
+            auto haystack_offset = (capacity() + m_reading_head - distance) % capacity();
+
+            for (size_t i = 0; i < minimum_length; i++) {
+                if (m_buffer[needle_offset] != m_buffer[haystack_offset])
+                    break;
+
+                needle_offset = (needle_offset + 1) % capacity();
+                haystack_offset = (haystack_offset + 1) % capacity();
+
+                if (i + 1 == minimum_length)
+                    TRY(matches.try_empend(distance, minimum_length));
+            }
+        }
+    } else {
+        // Otherwise, use memmem to find the initial matches.
+        // Note: We have the read head as our reference point, but `next_read_span_with_seekback` isn't aware of that and continues to use the write head.
+        //       Therefore, we need to make sure to slice off the extraneous bytes from the end of the span and shift the returned distances by the correct amount.
+        size_t haystack_offset_from_start = 0;
+        Vector<ReadonlyBytes, 2> haystack;
+        haystack.append(next_read_span_with_seekback(m_seekback_limit));
+        if (haystack[0].size() < m_seekback_limit - used_space())
+            haystack.append(next_read_span_with_seekback(m_seekback_limit - haystack[0].size()));
+
+        haystack.last() = haystack.last().trim(haystack.last().size() - used_space());
+
+        auto needle = next_read_span().trim(minimum_length);
+
+        auto memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
+        while (memmem_match.has_value()) {
+            auto match_offset = memmem_match.release_value();
+
+            // Add the match to the list of matches to work with.
+            TRY(matches.try_empend(m_seekback_limit - used_space() - haystack_offset_from_start - match_offset, minimum_length));
+
+            auto size_to_discard = match_offset + 1;
+
+            // Trim away the already processed bytes from the haystack.
+            haystack_offset_from_start += size_to_discard;
+            while (size_to_discard > 0) {
+                if (haystack[0].size() < size_to_discard) {
+                    size_to_discard -= haystack[0].size();
+                    haystack.remove(0);
+                } else {
+                    haystack[0] = haystack[0].slice(size_to_discard);
+                    break;
+                }
+            }
+
+            if (haystack.size() == 0)
+                break;
+
+            // Try and find the next match.
+            memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
+        }
+    }
+
+    // From now on, all matches that we have stored have at least a length of `minimum_length` and they all refer to the same value.
+    // For the remaining part, we will keep checking the next byte incrementally and keep eliminating matches until we eliminated all of them.
+    Vector<Match> next_matches;
+
+    for (size_t offset = minimum_length; offset < maximum_length; offset++) {
+        auto needle_data = m_buffer[(capacity() + m_reading_head + offset) % capacity()];
+
+        for (auto const& match : matches) {
+            auto haystack_data = m_buffer[(capacity() + m_reading_head - match.distance + offset) % capacity()];
+
+            if (haystack_data != needle_data)
+                continue;
+
+            TRY(next_matches.try_empend(match.distance, match.length + 1));
+        }
+
+        if (next_matches.size() == 0)
+            return matches;
+
+        swap(matches, next_matches);
+        next_matches.clear_with_capacity();
+    }
+
+    return matches;
+}
+
 }
author	Tim Schumacher <timschumi@gmx.de>	2023-05-03 11:14:36 +0200
committer	Andreas Kling <kling@serenityos.org>	2023-05-17 09:08:53 +0200
commit	221b91ff61aa4984577d76efcbaacfe1fafd61b4 (patch)
tree	096999093758f4c5884c59f373a1599dc09f4ffb /AK/CircularBuffer.cpp
parent	d194011570cca93324137e3c697e7f95f2638fe6 (diff)
download	serenity-221b91ff61aa4984577d76efcbaacfe1fafd61b4.zip