LibCompress: Add support for LZMA streams

author: Tim Schumacher <timschumi@gmx.de> 2023-03-05 14:34:05 +0100
committer: Idan Horowitz <idan.horowitz@gmail.com> 2023-03-20 12:15:38 +0200
commit: b3a9729e23a00d40d63b63d7a6c5c8075dc5d481 (patch)
tree: 04c4f8a1ec781c074335630264801cb4e8a9cf0a /Userland
parent: 5c4ffbcb4654509ce238b3d3dc1edba7ee6c38c9 (diff)
download: serenity-b3a9729e23a00d40d63b63d7a6c5c8075dc5d481.zip
3 files changed, 812 insertions, 0 deletions
diff --git a/Userland/Libraries/LibCompress/CMakeLists.txt b/Userland/Libraries/LibCompress/CMakeLists.txt
index fbc4fe2f62..c30ffd0397 100644
--- a/Userland/Libraries/LibCompress/CMakeLists.txt
+++ b/Userland/Libraries/LibCompress/CMakeLists.txt
@@ -2,6 +2,7 @@ set(SOURCES
     Brotli.cpp
     BrotliDictionary.cpp
     Deflate.cpp
+    Lzma.cpp
     Zlib.cpp
     Gzip.cpp
 )
diff --git a/Userland/Libraries/LibCompress/Lzma.cpp b/Userland/Libraries/LibCompress/Lzma.cpp
new file mode 100644
index 0000000000..ccd3440f85
--- /dev/null
+++ b/Userland/Libraries/LibCompress/Lzma.cpp
@@ -0,0 +1,658 @@
+/*
+ * Copyright (c) 2023, Tim Schumacher <timschumi@gmx.de>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibCompress/Lzma.h>
+
+namespace Compress {
+
+u32 LzmaHeader::dictionary_size() const
+{
+    // "If the value of dictionary size in properties is smaller than (1 << 12),
+    //  the LZMA decoder must set the dictionary size variable to (1 << 12)."
+    constexpr u32 minimum_dictionary_size = (1 << 12);
+    if (m_dictionary_size < minimum_dictionary_size)
+        return minimum_dictionary_size;
+
+    return m_dictionary_size;
+}
+
+Optional<u64> LzmaHeader::uncompressed_size() const
+{
+    // We are making a copy of the packed field here because we would otherwise
+    // pass an unaligned reference to the constructor of Optional, which is
+    // undefined behavior.
+    auto uncompressed_size = m_uncompressed_size;
+
+    // "If "Uncompressed size" field contains ones in all 64 bits, it means that
+    //  uncompressed size is unknown and there is the "end marker" in stream,
+    //  that indicates the end of decoding point."
+    if (uncompressed_size == UINT64_MAX)
+        return {};
+
+    // "In opposite case, if the value from "Uncompressed size" field is not
+    //  equal to ((2^64) - 1), the LZMA stream decoding must be finished after
+    //  specified number of bytes (Uncompressed size) is decoded. And if there
+    //  is the "end marker", the LZMA decoder must read that marker also."
+    return uncompressed_size;
+}
+
+ErrorOr<void> LzmaHeader::decode_model_properties(u8& literal_context_bits, u8& literal_pos_bits, u8& pos_bits) const
+{
+    // "Decodes the following values from the encoded model properties field:
+    //
+    //     name  Range          Description
+    //       lc  [0, 8]         the number of "literal context" bits
+    //       lp  [0, 4]         the number of "literal pos" bits
+    //       pb  [0, 4]         the number of "pos" bits
+    //
+    //  Encoded using `((pb * 5 + lp) * 9 + lc)`."
+
+    u8 input_bits = m_model_properties;
+
+    if (input_bits >= (9 * 5 * 5))
+        return Error::from_string_literal("Encoded model properties value is larger than the highest possible value");
+
+    literal_context_bits = input_bits % 9;
+    input_bits /= 9;
+
+    literal_pos_bits = input_bits % 5;
+    input_bits /= 5;
+
+    pos_bits = input_bits;
+
+    return {};
+}
+
+ErrorOr<LzmaDecompressorOptions> LzmaHeader::as_decompressor_options() const
+{
+    u8 literal_context_bits { 0 };
+    u8 literal_position_bits { 0 };
+    u8 position_bits { 0 };
+
+    TRY(decode_model_properties(literal_context_bits, literal_position_bits, position_bits));
+
+    return LzmaDecompressorOptions {
+        .literal_context_bits = literal_context_bits,
+        .literal_position_bits = literal_position_bits,
+        .position_bits = position_bits,
+        .dictionary_size = dictionary_size(),
+        .uncompressed_size = uncompressed_size(),
+    };
+}
+
+void LzmaDecompressor::initialize_to_default_probability(Span<Probability> span)
+{
+    for (auto& entry : span)
+        entry = default_probability;
+}
+
+ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_container(MaybeOwned<Stream> stream)
+{
+    auto header = TRY(stream->read_value<LzmaHeader>());
+
+    return TRY(LzmaDecompressor::create_from_raw_stream(move(stream), TRY(header.as_decompressor_options())));
+}
+
+ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_raw_stream(MaybeOwned<Stream> stream, LzmaDecompressorOptions const& options)
+{
+    // "The LZMA Encoder always writes ZERO in initial byte of compressed stream.
+    //  That scheme allows to simplify the code of the Range Encoder in the
+    //  LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must
+    //  stop decoding and report error."
+    {
+        auto byte = TRY(stream->read_value<u8>());
+        if (byte != 0)
+            return Error::from_string_literal("Initial byte of data stream is not zero");
+    }
+
+    auto output_buffer = TRY(CircularBuffer::create_empty(options.dictionary_size));
+
+    // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values."
+    auto literal_probabilities = TRY(FixedArray<Probability>::create(literal_probability_table_size * (1 << (options.literal_context_bits + options.literal_position_bits))));
+
+    auto decompressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) LzmaDecompressor(move(stream), options, move(output_buffer), move(literal_probabilities))));
+
+    // Read the initial bytes into the range decoder.
+    for (size_t i = 0; i < 4; i++) {
+        auto byte = TRY(decompressor->m_stream->read_value<u8>());
+        decompressor->m_range_decoder_code = decompressor->m_range_decoder_code << 8 | byte;
+    }
+
+    return decompressor;
+}
+
+LzmaDecompressor::LzmaDecompressor(MaybeOwned<Stream> stream, LzmaDecompressorOptions options, CircularBuffer output_buffer, FixedArray<Probability> literal_probabilities)
+    : m_stream(move(stream))
+    , m_options(move(options))
+    , m_output_buffer(move(output_buffer))
+    , m_literal_probabilities(move(literal_probabilities))
+{
+    initialize_to_default_probability(m_literal_probabilities.span());
+
+    for (auto& array : m_length_to_position_states)
+        initialize_to_default_probability(array);
+
+    for (auto& array : m_binary_tree_distance_probabilities)
+        initialize_to_default_probability(array);
+
+    initialize_to_default_probability(m_alignment_bit_probabilities);
+
+    initialize_to_default_probability(m_is_match_probabilities);
+    initialize_to_default_probability(m_is_rep_probabilities);
+    initialize_to_default_probability(m_is_rep_g0_probabilities);
+    initialize_to_default_probability(m_is_rep_g1_probabilities);
+    initialize_to_default_probability(m_is_rep_g2_probabilities);
+    initialize_to_default_probability(m_is_rep0_long_probabilities);
+}
+
+ErrorOr<void> LzmaDecompressor::normalize_range_decoder()
+{
+    // "The value of the "Range" variable before each bit decoding can not be smaller
+    //  than ((UInt32)1 << 24). The Normalize() function keeps the "Range" value in
+    //  described range."
+    constexpr u32 minimum_range_value = 1 << 24;
+
+    if (m_range_decoder_range >= minimum_range_value)
+        return {};
+
+    m_range_decoder_range <<= 8;
+    m_range_decoder_code <<= 8;
+
+    m_range_decoder_code |= TRY(m_stream->read_value<u8>());
+
+    VERIFY(m_range_decoder_range >= minimum_range_value);
+
+    return {};
+}
+
+ErrorOr<u8> LzmaDecompressor::decode_direct_bit()
+{
+    m_range_decoder_range >>= 1;
+    m_range_decoder_code -= m_range_decoder_range;
+
+    u32 temp = 0 - (m_range_decoder_code >> 31);
+
+    m_range_decoder_code += m_range_decoder_range & temp;
+
+    if (m_range_decoder_code == m_range_decoder_range)
+        return Error::from_string_literal("Reached an invalid state while decoding LZMA stream");
+
+    TRY(normalize_range_decoder());
+
+    return temp + 1;
+}
+
+ErrorOr<u8> LzmaDecompressor::decode_bit_with_probability(Probability& probability)
+{
+    // "The LZMA decoder provides the pointer to CProb variable that contains
+    //  information about estimated probability for symbol 0 and the Range Decoder
+    //  updates that CProb variable after decoding."
+
+    // The significance of the shift width is not explained and appears to be a magic constant.
+    constexpr size_t probability_shift_width = 5;
+
+    u32 bound = (m_range_decoder_range >> probability_bit_count) * probability;
+
+    if (m_range_decoder_code < bound) {
+        probability += ((1 << probability_bit_count) - probability) >> probability_shift_width;
+        m_range_decoder_range = bound;
+        TRY(normalize_range_decoder());
+        return 0;
+    } else {
+        probability -= probability >> probability_shift_width;
+        m_range_decoder_code -= bound;
+        m_range_decoder_range -= bound;
+        TRY(normalize_range_decoder());
+        return 1;
+    }
+}
+
+ErrorOr<u16> LzmaDecompressor::decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree)
+{
+    VERIFY(bit_count <= sizeof(u16) * 8);
+    VERIFY(probability_tree.size() >= 1ul << bit_count);
+
+    // This has been modified from the reference implementation to unlink the result and the tree index,
+    // which should allow for better readability.
+
+    u16 result = 0;
+    size_t tree_index = 1;
+
+    for (size_t i = 0; i < bit_count; i++) {
+        u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index]));
+        result = (result << 1) | next_bit;
+        tree_index = (tree_index << 1) | next_bit;
+    }
+
+    return result;
+}
+
+ErrorOr<u16> LzmaDecompressor::decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree)
+{
+    VERIFY(bit_count <= sizeof(u16) * 8);
+    VERIFY(probability_tree.size() >= 1ul << bit_count);
+
+    u16 result = 0;
+    size_t tree_index = 1;
+
+    for (size_t i = 0; i < bit_count; i++) {
+        u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index]));
+        result |= next_bit << i;
+        tree_index = (tree_index << 1) | next_bit;
+    }
+
+    return result;
+}
+
+ErrorOr<void> LzmaDecompressor::decode_literal_to_output_buffer()
+{
+    u8 previous_byte = 0;
+    if (m_total_decoded_bytes > 0) {
+        auto read_bytes = MUST(m_output_buffer.read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 1));
+        VERIFY(read_bytes.size() == sizeof(previous_byte));
+    }
+
+    // "To select the table for decoding it uses the context that consists of
+    //  (lc) high bits from previous literal and (lp) low bits from value that
+    //  represents current position in outputStream."
+    u16 literal_state_bits_from_position = m_total_decoded_bytes & ((1 << m_options.literal_position_bits) - 1);
+    u16 literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits);
+    u16 literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output;
+
+    Span<Probability> selected_probability_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size);
+
+    // The result is defined as u16 here and initialized to 1, but we will cut off the top bits before queueing them into the output buffer.
+    // The top bit is only used to track how much we have decoded already, and to select the correct probability table.
+    u16 result = 1;
+
+    // "If (State > 7), the Literal Decoder also uses "matchByte" that represents
+    //  the byte in OutputStream at position the is the DISTANCE bytes before
+    //  current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair
+    //  of latest decoded match."
+    // Note: The specification says `(State > 7)`, but the reference implementation does `(State >= 7)`, which is a mismatch.
+    //       Testing `(State > 7)` with actual test files yields errors, so the reference implementation appears to be the correct one.
+    if (m_state >= 7) {
+        u8 matched_byte = 0;
+        auto read_bytes = TRY(m_output_buffer.read_with_seekback({ &matched_byte, sizeof(matched_byte) }, m_rep0 + 1));
+        VERIFY(read_bytes.size() == sizeof(matched_byte));
+
+        do {
+            u8 match_bit = (matched_byte >> 7) & 1;
+            matched_byte <<= 1;
+
+            u8 decoded_bit = TRY(decode_bit_with_probability(selected_probability_table[((1 + match_bit) << 8) + result]));
+            result = result << 1 | decoded_bit;
+
+            if (match_bit != decoded_bit)
+                break;
+        } while (result < 0x100);
+    }
+
+    while (result < 0x100)
+        result = (result << 1) | TRY(decode_bit_with_probability(selected_probability_table[result]));
+
+    u8 actual_result = result - 0x100;
+
+    size_t written_bytes = m_output_buffer.write({ &actual_result, sizeof(actual_result) });
+    VERIFY(written_bytes == sizeof(actual_result));
+    m_total_decoded_bytes += sizeof(actual_result);
+
+    return {};
+}
+
+LzmaDecompressor::LzmaLengthDecoderState::LzmaLengthDecoderState()
+{
+    for (auto& array : m_low_length_probabilities)
+        initialize_to_default_probability(array);
+
+    for (auto& array : m_medium_length_probabilities)
+        initialize_to_default_probability(array);
+
+    initialize_to_default_probability(m_high_length_probabilities);
+}
+
+ErrorOr<u16> LzmaDecompressor::decode_normalized_match_length(LzmaLengthDecoderState& length_decoder_state)
+{
+    // "LZMA uses "posState" value as context to select the binary tree
+    //  from LowCoder and MidCoder binary tree arrays:"
+    u16 position_state = m_total_decoded_bytes & ((1 << m_options.position_bits) - 1);
+
+    // "The following scheme is used for the match length encoding:
+    //
+    //   Binary encoding    Binary Tree structure    Zero-based match length
+    //   sequence                                    (binary + decimal):
+    //
+    //   0 xxx              LowCoder[posState]       xxx
+    if (TRY(decode_bit_with_probability(length_decoder_state.m_first_choice_probability)) == 0)
+        return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_low_length_probabilities[position_state].span()));
+
+    //   1 0 yyy            MidCoder[posState]       yyy + 8
+    if (TRY(decode_bit_with_probability(length_decoder_state.m_second_choice_probability)) == 0)
+        return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_medium_length_probabilities[position_state].span())) + 8;
+
+    //   1 1 zzzzzzzz       HighCoder                zzzzzzzz + 16"
+    return TRY(decode_symbol_using_bit_tree(8, length_decoder_state.m_high_length_probabilities.span())) + 16;
+}
+
+ErrorOr<u32> LzmaDecompressor::decode_normalized_match_distance(u16 normalized_match_length)
+{
+    // "LZMA uses normalized match length (zero-based length)
+    //  to calculate the context state "lenState" do decode the distance value."
+    u16 length_state = min(normalized_match_length, number_of_length_to_position_states - 1);
+
+    // "At first stage the distance decoder decodes 6-bit "posSlot" value with bit
+    //  tree decoder from PosSlotDecoder array."
+    u16 position_slot = TRY(decode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span()));
+
+    // "The encoding scheme for distance value is shown in the following table:
+    //
+    //  posSlot (decimal) /
+    //       zero-based distance (binary)
+    //  0    0
+    //  1    1
+    //  2    10
+    //  3    11
+    //
+    //  4    10 x
+    //  5    11 x
+    //  6    10 xx
+    //  7    11 xx
+    //  8    10 xxx
+    //  9    11 xxx
+    //  10    10 xxxx
+    //  11    11 xxxx
+    //  12    10 xxxxx
+    //  13    11 xxxxx
+    //
+    //  14    10 yy zzzz
+    //  15    11 yy zzzz
+    //  16    10 yyy zzzz
+    //  17    11 yyy zzzz
+    //  ...
+    //  62    10 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz
+    //  63    11 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz
+    //
+    //  where
+    //   "x ... x" means the sequence of binary symbols encoded with binary tree and
+    //       "Reverse" scheme. It uses separated binary tree for each posSlot from 4 to 13.
+    //   "y" means direct bit encoded with range coder.
+    //   "zzzz" means the sequence of four binary symbols encoded with binary
+    //       tree with "Reverse" scheme, where one common binary tree "AlignDecoder"
+    //       is used for all posSlot values."
+
+    // "If (posSlot < 4), the "dist" value is equal to posSlot value."
+    if (position_slot < first_position_slot_with_binary_tree_bits)
+        return position_slot;
+
+    // From here on, the first bit of the distance is always set and the second bit is set if the last bit of the position slot is set.
+    u32 distance_prefix = ((1 << 1) | ((position_slot & 1) << 0));
+
+    // "If (posSlot >= 4), the decoder uses "posSlot" value to calculate the value of
+    //   the high bits of "dist" value and the number of the low bits.
+    //   If (4 <= posSlot < kEndPosModelIndex), the decoder uses bit tree decoders.
+    //     (one separated bit tree decoder per one posSlot value) and "Reverse" scheme."
+    if (position_slot < first_position_slot_with_direct_encoded_bits) {
+        size_t number_of_bits_to_decode = (position_slot / 2) - 1;
+        auto& selected_probability_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits];
+        return (distance_prefix << number_of_bits_to_decode) | TRY(decode_symbol_using_reverse_bit_tree(number_of_bits_to_decode, selected_probability_tree));
+    }
+
+    // "  if (posSlot >= kEndPosModelIndex), the middle bits are decoded as direct
+    //     bits from RangeDecoder and the low 4 bits are decoded with a bit tree
+    //     decoder "AlignDecoder" with "Reverse" scheme."
+    size_t number_of_direct_bits_to_decode = ((position_slot - first_position_slot_with_direct_encoded_bits) / 2) + 2;
+    for (size_t i = 0; i < number_of_direct_bits_to_decode; i++) {
+        distance_prefix = (distance_prefix << 1) | TRY(decode_direct_bit());
+    }
+    return (distance_prefix << number_of_alignment_bits) | TRY(decode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities));
+}
+
+ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
+{
+    while (m_output_buffer.used_space() < bytes.size() && m_output_buffer.empty_space() != 0) {
+        if (m_found_end_of_stream_marker) {
+            if (m_options.uncompressed_size.has_value() && m_total_decoded_bytes < m_options.uncompressed_size.value())
+                return Error::from_string_literal("Found end-of-stream marker earlier than expected");
+
+            break;
+        }
+
+        if (m_options.uncompressed_size.has_value() && m_total_decoded_bytes >= m_options.uncompressed_size.value()) {
+            // FIXME: This should validate that either EOF or the 'end of stream' marker follow immediately.
+            //        Both of those cases count as the 'end of stream' marker being found and should check for a clean decoder state.
+            break;
+        }
+
+        // "The decoder calculates "state2" variable value to select exact variable from
+        //  "IsMatch" and "IsRep0Long" arrays."
+        u16 position_state = m_total_decoded_bytes & ((1 << m_options.position_bits) - 1);
+        u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;
+
+        auto update_state_after_literal = [&] {
+            if (m_state < 4)
+                m_state = 0;
+            else if (m_state < 10)
+                m_state -= 3;
+            else
+                m_state -= 6;
+        };
+
+        auto update_state_after_match = [&] {
+            if (m_state < 7)
+                m_state = 7;
+            else
+                m_state = 10;
+        };
+
+        auto update_state_after_rep = [&] {
+            if (m_state < 7)
+                m_state = 8;
+            else
+                m_state = 11;
+        };
+
+        auto update_state_after_short_rep = [&] {
+            if (m_state < 7)
+                m_state = 9;
+            else
+                m_state = 11;
+        };
+
+        auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr<void> {
+            VERIFY(!m_leftover_match_length.has_value());
+
+            if (m_options.uncompressed_size.has_value() && m_options.uncompressed_size.value() < m_total_decoded_bytes + real_length)
+                return Error::from_string_literal("Tried to copy match beyond expected uncompressed file size");
+
+            while (real_length > 0) {
+                if (m_output_buffer.empty_space() == 0) {
+                    m_leftover_match_length = real_length;
+                    break;
+                }
+
+                u8 byte;
+                auto read_bytes = TRY(m_output_buffer.read_with_seekback({ &byte, sizeof(byte) }, m_rep0 + 1));
+                VERIFY(read_bytes.size() == sizeof(byte));
+
+                auto written_bytes = m_output_buffer.write({ &byte, sizeof(byte) });
+                VERIFY(written_bytes == sizeof(byte));
+                m_total_decoded_bytes++;
+
+                real_length--;
+            }
+
+            return {};
+        };
+
+        // If we have a leftover part of a repeating match, we should finish that first.
+        if (m_leftover_match_length.has_value()) {
+            TRY(copy_match_to_buffer(m_leftover_match_length.release_value()));
+            continue;
+        }
+
+        // "The decoder uses the following code flow scheme to select exact
+        //  type of LITERAL or MATCH:
+        //
+        //  IsMatch[state2] decode
+        //   0 - the Literal"
+        if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) {
+            // "At first the LZMA decoder must check that it doesn't exceed
+            //  specified uncompressed size."
+            // This is already checked for at the beginning of the loop.
+
+            // "Then it decodes literal value and puts it to sliding window."
+            TRY(decode_literal_to_output_buffer());
+
+            // "Then the decoder must update the "state" value."
+            update_state_after_literal();
+            continue;
+        }
+
+        // " 1 - the Match
+        //     IsRep[state] decode
+        //       0 - Simple Match"
+        if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0) {
+            // "The distance history table is updated with the following scheme:"
+            m_rep3 = m_rep2;
+            m_rep2 = m_rep1;
+            m_rep1 = m_rep0;
+
+            // "The zero-based length is decoded with "LenDecoder"."
+            u16 normalized_length = TRY(decode_normalized_match_length(m_length_decoder));
+
+            // "The state is update with UpdateState_Match function."
+            update_state_after_match();
+
+            // "and the new "rep0" value is decoded with DecodeDistance."
+            m_rep0 = TRY(decode_normalized_match_distance(normalized_length));
+
+            // "If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have
+            //  "End of stream" marker, so we can stop decoding and check finishing
+            //  condition in Range Decoder"
+            if (m_rep0 == 0xFFFFFFFF) {
+                // The range decoder condition is checked after breaking out of the loop.
+                m_found_end_of_stream_marker = true;
+                continue;
+            }
+
+            // "If uncompressed size is defined, LZMA decoder must check that it doesn't
+            //  exceed that specified uncompressed size."
+            // This is being checked for in the common "copy to buffer" implementation.
+
+            // "Also the decoder must check that "rep0" value is not larger than dictionary size
+            //  and is not larger than the number of already decoded bytes."
+            if (m_rep0 > min(m_options.dictionary_size, m_total_decoded_bytes))
+                return Error::from_string_literal("rep0 value is larger than the possible lookback size");
+
+            // "Then the decoder must copy match bytes as described in
+            //  "The match symbols copying" section."
+            TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset));
+
+            continue;
+        }
+
+        // "     1 - Rep Match
+        //         IsRepG0[state] decode
+        //           0 - the distance is rep0"
+        if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) {
+            // "LZMA doesn't update the distance history."
+
+            // "       IsRep0Long[state2] decode
+            //           0 - Short Rep Match"
+            if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0) {
+                // "If the subtype is "Short Rep Match", the decoder updates the state, puts
+                //  the one byte from window to current position in window and goes to next
+                //  MATCH/LITERAL symbol."
+                update_state_after_short_rep();
+
+                u8 byte;
+                auto read_bytes = TRY(m_output_buffer.read_with_seekback({ &byte, sizeof(byte) }, m_rep0 + 1));
+                VERIFY(read_bytes.size() == sizeof(byte));
+
+                auto written_bytes = m_output_buffer.write({ &byte, sizeof(byte) });
+                VERIFY(written_bytes == sizeof(byte));
+                m_total_decoded_bytes++;
+
+                continue;
+            }
+            // "         1 - Rep Match 0"
+            // Intentional fallthrough, we just need to make sure to not run the detection for other match types and to not switch around the distance history.
+        } else {
+            // "     1 -
+            //         IsRepG1[state] decode
+            //           0 - Rep Match 1"
+            if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0) {
+                u32 distance = m_rep1;
+                m_rep1 = m_rep0;
+                m_rep0 = distance;
+            }
+
+            // "         1 -
+            //             IsRepG2[state] decode
+            //               0 - Rep Match 2"
+            else if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0) {
+                u32 distance = m_rep2;
+                m_rep2 = m_rep1;
+                m_rep1 = m_rep0;
+                m_rep0 = distance;
+            }
+
+            // "             1 - Rep Match 3"
+            else {
+                u32 distance = m_rep3;
+                m_rep3 = m_rep2;
+                m_rep2 = m_rep1;
+                m_rep1 = m_rep0;
+                m_rep0 = distance;
+            }
+        }
+
+        // "In other cases (Rep Match 0/1/2/3), it decodes the zero-based
+        //  length of match with "RepLenDecoder" decoder."
+        u16 normalized_length = TRY(decode_normalized_match_length(m_rep_length_decoder));
+
+        // "Then it updates the state."
+        update_state_after_rep();
+
+        // "Then the decoder must copy match bytes as described in
+        //  "The Match symbols copying" section."
+        TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset));
+    }
+
+    if (m_found_end_of_stream_marker) {
+        if (m_range_decoder_code != 0)
+            return Error::from_string_literal("LZMA stream ends in an unclean state");
+    }
+
+    return m_output_buffer.read(bytes);
+}
+
+ErrorOr<size_t> LzmaDecompressor::write_some(ReadonlyBytes)
+{
+    return Error::from_errno(EBADF);
+}
+
+bool LzmaDecompressor::is_eof() const
+{
+    if (m_output_buffer.used_space() > 0)
+        return false;
+
+    if (m_options.uncompressed_size.has_value())
+        return m_total_decoded_bytes >= m_options.uncompressed_size.value();
+
+    return m_found_end_of_stream_marker;
+}
+
+bool LzmaDecompressor::is_open() const
+{
+    return true;
+}
+
+void LzmaDecompressor::close()
+{
+}
+
+}
diff --git a/Userland/Libraries/LibCompress/Lzma.h b/Userland/Libraries/LibCompress/Lzma.h
new file mode 100644
index 0000000000..ff9b83afd9
--- /dev/null
+++ b/Userland/Libraries/LibCompress/Lzma.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2023, Tim Schumacher <timschumi@gmx.de>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/CircularBuffer.h>
+#include <AK/FixedArray.h>
+#include <AK/MaybeOwned.h>
+#include <AK/NonnullOwnPtr.h>
+#include <AK/Stream.h>
+
+namespace Compress {
+
+// This implementation is mostly based on the LZMA specification contained in the 7-Zip SDK, which has been placed in the public domain.
+// LZMA Specification Draft (2015): https://www.7-zip.org/a/lzma-specification.7z
+
+struct LzmaDecompressorOptions {
+    u8 literal_context_bits { 0 };
+    u8 literal_position_bits { 0 };
+    u8 position_bits { 0 };
+    u32 dictionary_size { 0 };
+    Optional<u64> uncompressed_size;
+};
+
+// Described in section "lzma file format".
+struct [[gnu::packed]] LzmaHeader {
+    u32 dictionary_size() const;
+    Optional<u64> uncompressed_size() const;
+
+    ErrorOr<LzmaDecompressorOptions> as_decompressor_options() const;
+
+private:
+    ErrorOr<void> decode_model_properties(u8& literal_context_bits, u8& literal_pos_bits, u8& pos_bits) const;
+
+    u8 m_model_properties;
+    u32 m_dictionary_size;
+    u64 m_uncompressed_size;
+};
+static_assert(sizeof(LzmaHeader) == 13);
+
+class LzmaDecompressor : public Stream {
+public:
+    /// Creates a decompressor from a standalone LZMA container (.lzma file extension, occasionally known as an LZMA 'archive').
+    static ErrorOr<NonnullOwnPtr<LzmaDecompressor>> create_from_container(MaybeOwned<Stream>);
+
+    /// Creates a decompressor from a raw stream of LZMA-compressed data (found inside an LZMA container or embedded in other file formats).
+    static ErrorOr<NonnullOwnPtr<LzmaDecompressor>> create_from_raw_stream(MaybeOwned<Stream>, LzmaDecompressorOptions const&);
+
+    virtual ErrorOr<Bytes> read_some(Bytes) override;
+    virtual ErrorOr<size_t> write_some(ReadonlyBytes) override;
+    virtual bool is_eof() const override;
+    virtual bool is_open() const override;
+    virtual void close() override;
+
+private:
+    // LZMA uses 11-bit probability counters, but they are usually stored in 16-bit variables.
+    // Therefore, we can model probabilities with a resolution of up to 1 / 2^11 (which is equal to 1 / 2048).
+    // The default probability for most counters is 0.5.
+    using Probability = u16;
+    static constexpr size_t probability_bit_count = 11;
+    static constexpr Probability default_probability = (1 << probability_bit_count) / 2;
+    static void initialize_to_default_probability(Span<Probability>);
+
+    LzmaDecompressor(MaybeOwned<Stream>, LzmaDecompressorOptions, CircularBuffer, FixedArray<Probability> literal_probabilities);
+
+    MaybeOwned<Stream> m_stream;
+    LzmaDecompressorOptions m_options;
+
+    CircularBuffer m_output_buffer;
+    u64 m_total_decoded_bytes { 0 };
+    bool m_found_end_of_stream_marker { false };
+    Optional<u16> m_leftover_match_length;
+
+    // Range decoder state (initialized with stream data in LzmaDecompressor::create).
+    u32 m_range_decoder_range { 0xFFFFFFFF };
+    u32 m_range_decoder_code { 0 };
+
+    ErrorOr<void> normalize_range_decoder();
+    ErrorOr<u8> decode_direct_bit();
+    ErrorOr<u8> decode_bit_with_probability(Probability& probability);
+
+    // Decodes a multi-bit symbol using a given probability tree (either in normal or in reverse order).
+    // The specification states that "unsigned" is at least 16 bits in size, our implementation assumes this as the maximum symbol size.
+    ErrorOr<u16> decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree);
+    ErrorOr<u16> decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree);
+
+    ErrorOr<void> decode_literal_to_output_buffer();
+    static constexpr size_t literal_probability_table_size = 0x300;
+    FixedArray<Probability> m_literal_probabilities;
+
+    struct LzmaLengthDecoderState {
+    public:
+        LzmaLengthDecoderState();
+
+        Probability m_first_choice_probability { default_probability };
+        Probability m_second_choice_probability { default_probability };
+
+        static constexpr size_t maximum_number_of_position_bits = 4;
+        Array<Array<Probability, (1 << 3)>, (1 << maximum_number_of_position_bits)> m_low_length_probabilities;
+        Array<Array<Probability, (1 << 3)>, (1 << maximum_number_of_position_bits)> m_medium_length_probabilities;
+        Array<Probability, (1 << 8)> m_high_length_probabilities;
+    };
+
+    LzmaLengthDecoderState m_length_decoder;
+    LzmaLengthDecoderState m_rep_length_decoder;
+    static constexpr u16 normalized_to_real_match_length_offset = 2;
+    ErrorOr<u16> decode_normalized_match_length(LzmaLengthDecoderState&);
+
+    static constexpr size_t number_of_length_to_position_states = 4;
+    Array<Array<Probability, (1 << 6)>, number_of_length_to_position_states> m_length_to_position_states;
+
+    static constexpr size_t first_position_slot_with_binary_tree_bits = 4;
+    static constexpr size_t first_position_slot_with_direct_encoded_bits = 14;
+
+    // This is a bit wasteful on memory and not in the specification, but it makes the math easier.
+    static constexpr size_t number_of_binary_tree_distance_slots = first_position_slot_with_direct_encoded_bits - first_position_slot_with_binary_tree_bits;
+    static constexpr size_t largest_number_of_binary_tree_distance_bits = 5;
+    Array<Array<Probability, (1 << largest_number_of_binary_tree_distance_bits)>, number_of_binary_tree_distance_slots> m_binary_tree_distance_probabilities;
+
+    static constexpr size_t number_of_alignment_bits = 4;
+    Array<Probability, (1 << number_of_alignment_bits)> m_alignment_bit_probabilities;
+
+    // This deviates from the specification, which states that "unsigned" is at least 16-bit.
+    // However, the match distance needs to be at least 32-bit, at the very least to hold the 0xFFFFFFFF end marker value.
+    static constexpr u32 normalized_to_real_match_distance_offset = 1;
+    ErrorOr<u32> decode_normalized_match_distance(u16 normalized_match_length);
+
+    // LZ state tracking.
+    u16 m_state { 0 };
+    u32 m_rep0 { 0 };
+    u32 m_rep1 { 0 };
+    u32 m_rep2 { 0 };
+    u32 m_rep3 { 0 };
+
+    static constexpr size_t maximum_number_of_position_bits = 4;
+    static constexpr size_t number_of_states = 12;
+    Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_match_probabilities;
+    Array<Probability, number_of_states> m_is_rep_probabilities;
+    Array<Probability, number_of_states> m_is_rep_g0_probabilities;
+    Array<Probability, number_of_states> m_is_rep_g1_probabilities;
+    Array<Probability, number_of_states> m_is_rep_g2_probabilities;
+    Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_rep0_long_probabilities;
+};
+
+}
+
+template<>
+struct AK::Traits<Compress::LzmaHeader> : public AK::GenericTraits<Compress::LzmaHeader> {
+    static constexpr bool is_trivially_serializable() { return true; }
+};
author	Tim Schumacher <timschumi@gmx.de>	2023-03-05 14:34:05 +0100
committer	Idan Horowitz <idan.horowitz@gmail.com>	2023-03-20 12:15:38 +0200
commit	b3a9729e23a00d40d63b63d7a6c5c8075dc5d481 (patch)
tree	04c4f8a1ec781c074335630264801cb4e8a9cf0a /Userland
parent	5c4ffbcb4654509ce238b3d3dc1edba7ee6c38c9 (diff)
download	serenity-b3a9729e23a00d40d63b63d7a6c5c8075dc5d481.zip