diff options
author | Timothy Flynn <trflynn89@pm.me> | 2023-03-30 09:18:59 -0400 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2023-03-31 06:56:05 +0200 |
commit | 62b575ad7c0c337063aa218b7d9fd968d91f78b3 (patch) | |
tree | c986de803e79eea812d6b740753f4141b75a81dc /Userland/Libraries/LibCrypto/Checksum | |
parent | 83cb73a0452c39cbaf26b0c1d05ed5a219af8d24 (diff) | |
download | serenity-62b575ad7c0c337063aa218b7d9fd968d91f78b3.zip |
LibCrypto: Implement little endian CRC using the slicing-by-8 algorithm
This implements Intel's slicing-by-8 algorithm for CRC checksums (only
little endian CPUs for now, as I don't have a way to test big endian).
The original paper for this algorithm seems to have disappeared, but
Intel's source code is still available as a reference:
https://sourceforge.net/projects/slicing-by-8/
As well as other implementations for reference:
https://docs.rs/slice-by-8/latest/src/slice_by_8/algorithm.rs.html
Using the "enwik8" file as a test (100MB uncompressed, commonly used in
benchmarks: https://www.mattmahoney.net/dc/enwik8.zip), decompression
time decreases from:
4.89s to 3.52s on Serenity (cold)
1.72s to 1.32s on Serenity (warm)
1.06s to 0.92s on Linux
Diffstat (limited to 'Userland/Libraries/LibCrypto/Checksum')
-rw-r--r-- | Userland/Libraries/LibCrypto/Checksum/CRC32.cpp | 95 |
1 files changed, 94 insertions, 1 deletions
diff --git a/Userland/Libraries/LibCrypto/Checksum/CRC32.cpp b/Userland/Libraries/LibCrypto/Checksum/CRC32.cpp index 7cdddfb274..e8ee7fd578 100644 --- a/Userland/Libraries/LibCrypto/Checksum/CRC32.cpp +++ b/Userland/Libraries/LibCrypto/Checksum/CRC32.cpp @@ -5,6 +5,7 @@ */ #include <AK/Array.h> +#include <AK/NumericLimits.h> #include <AK/Span.h> #include <AK/Types.h> #include <LibCrypto/Checksum/CRC32.h> @@ -46,6 +47,97 @@ void CRC32::update(ReadonlyBytes span) #else +static constexpr size_t ethernet_polynomial = 0xEDB88320; + +# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + +// This implements Intel's slicing-by-8 algorithm. Their original paper is no longer on their website, +// but their source code is still available for reference: +// https://sourceforge.net/projects/slicing-by-8/ +static constexpr auto generate_table() +{ + Array<Array<u32, 256>, 8> data {}; + + for (u32 i = 0; i < 256; ++i) { + auto value = i; + + for (size_t j = 0; j < 8; ++j) + value = (value >> 1) ^ ((value & 1) * ethernet_polynomial); + + data[0][i] = value; + } + + for (u32 i = 0; i < 256; ++i) { + for (size_t j = 1; j < 8; ++j) + data[j][i] = (data[j - 1][i] >> 8) ^ data[0][data[j - 1][i] & 0xff]; + } + + return data; +} + +static constexpr auto table = generate_table(); + +struct AlignmentData { + ReadonlyBytes misaligned; + ReadonlyBytes aligned; +}; + +static AlignmentData split_bytes_for_alignment(ReadonlyBytes data, size_t alignment) +{ + auto address = reinterpret_cast<uintptr_t>(data.data()); + auto offset = alignment - address % alignment; + + if (offset == alignment) + return { {}, data }; + + if (data.size() < alignment) + return { data, {} }; + + return { data.trim(offset), data.slice(offset) }; +} + +static constexpr u32 single_byte_crc(u32 crc, u8 byte) +{ + return (crc >> 8) ^ table[0][(crc & 0xff) ^ byte]; +} + +void CRC32::update(ReadonlyBytes data) +{ + // The provided data may not be aligned to a 4-byte boundary, required to reinterpret its address + // into a u32 in the loop below. So we split the bytes into two segments: the misaligned bytes + // (which undergo the standard 1-byte-at-a-time algorithm) and remaining aligned bytes. + auto [misaligned_data, aligned_data] = split_bytes_for_alignment(data, alignof(u32)); + + for (auto byte : misaligned_data) + m_state = single_byte_crc(m_state, byte); + + while (aligned_data.size() >= 8) { + auto const* segment = reinterpret_cast<u32 const*>(aligned_data.data()); + auto low = *segment ^ m_state; + auto high = *(++segment); + + // clang-format will put this all on one line, which is really hard to read. + // clang-format off + m_state = table[0][(high >> 24) & 0xff] + ^ table[1][(high >> 16) & 0xff] + ^ table[2][(high >> 8) & 0xff] + ^ table[3][high & 0xff] + ^ table[4][(low >> 24) & 0xff] + ^ table[5][(low >> 16) & 0xff] + ^ table[6][(low >> 8) & 0xff] + ^ table[7][low & 0xff]; + // clang-format on + + aligned_data = aligned_data.slice(8); + } + + for (auto byte : aligned_data) + m_state = single_byte_crc(m_state, byte); +}; + +# else + +// FIXME: Implement the slicing-by-8 algorithm for big endian CPUs. static constexpr auto generate_table() { Array<u32, 256> data {}; @@ -54,7 +146,7 @@ static constexpr auto generate_table() for (auto j = 0; j < 8; j++) { if (value & 1) { - value = 0xEDB88320 ^ (value >> 1); + value = ethernet_polynomial ^ (value >> 1); } else { value = value >> 1; } @@ -74,6 +166,7 @@ void CRC32::update(ReadonlyBytes data) } }; +# endif #endif u32 CRC32::digest() |