diff options
author | Tim Schumacher <timschumi@gmx.de> | 2023-03-05 14:34:05 +0100 |
---|---|---|
committer | Idan Horowitz <idan.horowitz@gmail.com> | 2023-03-20 12:15:38 +0200 |
commit | b3a9729e23a00d40d63b63d7a6c5c8075dc5d481 (patch) | |
tree | 04c4f8a1ec781c074335630264801cb4e8a9cf0a /Userland | |
parent | 5c4ffbcb4654509ce238b3d3dc1edba7ee6c38c9 (diff) | |
download | serenity-b3a9729e23a00d40d63b63d7a6c5c8075dc5d481.zip |
LibCompress: Add support for LZMA streams
Diffstat (limited to 'Userland')
-rw-r--r-- | Userland/Libraries/LibCompress/CMakeLists.txt | 1 | ||||
-rw-r--r-- | Userland/Libraries/LibCompress/Lzma.cpp | 658 | ||||
-rw-r--r-- | Userland/Libraries/LibCompress/Lzma.h | 153 |
3 files changed, 812 insertions, 0 deletions
diff --git a/Userland/Libraries/LibCompress/CMakeLists.txt b/Userland/Libraries/LibCompress/CMakeLists.txt index fbc4fe2f62..c30ffd0397 100644 --- a/Userland/Libraries/LibCompress/CMakeLists.txt +++ b/Userland/Libraries/LibCompress/CMakeLists.txt @@ -2,6 +2,7 @@ set(SOURCES Brotli.cpp BrotliDictionary.cpp Deflate.cpp + Lzma.cpp Zlib.cpp Gzip.cpp ) diff --git a/Userland/Libraries/LibCompress/Lzma.cpp b/Userland/Libraries/LibCompress/Lzma.cpp new file mode 100644 index 0000000000..ccd3440f85 --- /dev/null +++ b/Userland/Libraries/LibCompress/Lzma.cpp @@ -0,0 +1,658 @@ +/* + * Copyright (c) 2023, Tim Schumacher <timschumi@gmx.de> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include <LibCompress/Lzma.h> + +namespace Compress { + +u32 LzmaHeader::dictionary_size() const +{ + // "If the value of dictionary size in properties is smaller than (1 << 12), + // the LZMA decoder must set the dictionary size variable to (1 << 12)." + constexpr u32 minimum_dictionary_size = (1 << 12); + if (m_dictionary_size < minimum_dictionary_size) + return minimum_dictionary_size; + + return m_dictionary_size; +} + +Optional<u64> LzmaHeader::uncompressed_size() const +{ + // We are making a copy of the packed field here because we would otherwise + // pass an unaligned reference to the constructor of Optional, which is + // undefined behavior. + auto uncompressed_size = m_uncompressed_size; + + // "If "Uncompressed size" field contains ones in all 64 bits, it means that + // uncompressed size is unknown and there is the "end marker" in stream, + // that indicates the end of decoding point." + if (uncompressed_size == UINT64_MAX) + return {}; + + // "In opposite case, if the value from "Uncompressed size" field is not + // equal to ((2^64) - 1), the LZMA stream decoding must be finished after + // specified number of bytes (Uncompressed size) is decoded. And if there + // is the "end marker", the LZMA decoder must read that marker also." + return uncompressed_size; +} + +ErrorOr<void> LzmaHeader::decode_model_properties(u8& literal_context_bits, u8& literal_pos_bits, u8& pos_bits) const +{ + // "Decodes the following values from the encoded model properties field: + // + // name Range Description + // lc [0, 8] the number of "literal context" bits + // lp [0, 4] the number of "literal pos" bits + // pb [0, 4] the number of "pos" bits + // + // Encoded using `((pb * 5 + lp) * 9 + lc)`." + + u8 input_bits = m_model_properties; + + if (input_bits >= (9 * 5 * 5)) + return Error::from_string_literal("Encoded model properties value is larger than the highest possible value"); + + literal_context_bits = input_bits % 9; + input_bits /= 9; + + literal_pos_bits = input_bits % 5; + input_bits /= 5; + + pos_bits = input_bits; + + return {}; +} + +ErrorOr<LzmaDecompressorOptions> LzmaHeader::as_decompressor_options() const +{ + u8 literal_context_bits { 0 }; + u8 literal_position_bits { 0 }; + u8 position_bits { 0 }; + + TRY(decode_model_properties(literal_context_bits, literal_position_bits, position_bits)); + + return LzmaDecompressorOptions { + .literal_context_bits = literal_context_bits, + .literal_position_bits = literal_position_bits, + .position_bits = position_bits, + .dictionary_size = dictionary_size(), + .uncompressed_size = uncompressed_size(), + }; +} + +void LzmaDecompressor::initialize_to_default_probability(Span<Probability> span) +{ + for (auto& entry : span) + entry = default_probability; +} + +ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_container(MaybeOwned<Stream> stream) +{ + auto header = TRY(stream->read_value<LzmaHeader>()); + + return TRY(LzmaDecompressor::create_from_raw_stream(move(stream), TRY(header.as_decompressor_options()))); +} + +ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_raw_stream(MaybeOwned<Stream> stream, LzmaDecompressorOptions const& options) +{ + // "The LZMA Encoder always writes ZERO in initial byte of compressed stream. + // That scheme allows to simplify the code of the Range Encoder in the + // LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must + // stop decoding and report error." + { + auto byte = TRY(stream->read_value<u8>()); + if (byte != 0) + return Error::from_string_literal("Initial byte of data stream is not zero"); + } + + auto output_buffer = TRY(CircularBuffer::create_empty(options.dictionary_size)); + + // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values." + auto literal_probabilities = TRY(FixedArray<Probability>::create(literal_probability_table_size * (1 << (options.literal_context_bits + options.literal_position_bits)))); + + auto decompressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) LzmaDecompressor(move(stream), options, move(output_buffer), move(literal_probabilities)))); + + // Read the initial bytes into the range decoder. + for (size_t i = 0; i < 4; i++) { + auto byte = TRY(decompressor->m_stream->read_value<u8>()); + decompressor->m_range_decoder_code = decompressor->m_range_decoder_code << 8 | byte; + } + + return decompressor; +} + +LzmaDecompressor::LzmaDecompressor(MaybeOwned<Stream> stream, LzmaDecompressorOptions options, CircularBuffer output_buffer, FixedArray<Probability> literal_probabilities) + : m_stream(move(stream)) + , m_options(move(options)) + , m_output_buffer(move(output_buffer)) + , m_literal_probabilities(move(literal_probabilities)) +{ + initialize_to_default_probability(m_literal_probabilities.span()); + + for (auto& array : m_length_to_position_states) + initialize_to_default_probability(array); + + for (auto& array : m_binary_tree_distance_probabilities) + initialize_to_default_probability(array); + + initialize_to_default_probability(m_alignment_bit_probabilities); + + initialize_to_default_probability(m_is_match_probabilities); + initialize_to_default_probability(m_is_rep_probabilities); + initialize_to_default_probability(m_is_rep_g0_probabilities); + initialize_to_default_probability(m_is_rep_g1_probabilities); + initialize_to_default_probability(m_is_rep_g2_probabilities); + initialize_to_default_probability(m_is_rep0_long_probabilities); +} + +ErrorOr<void> LzmaDecompressor::normalize_range_decoder() +{ + // "The value of the "Range" variable before each bit decoding can not be smaller + // than ((UInt32)1 << 24). The Normalize() function keeps the "Range" value in + // described range." + constexpr u32 minimum_range_value = 1 << 24; + + if (m_range_decoder_range >= minimum_range_value) + return {}; + + m_range_decoder_range <<= 8; + m_range_decoder_code <<= 8; + + m_range_decoder_code |= TRY(m_stream->read_value<u8>()); + + VERIFY(m_range_decoder_range >= minimum_range_value); + + return {}; +} + +ErrorOr<u8> LzmaDecompressor::decode_direct_bit() +{ + m_range_decoder_range >>= 1; + m_range_decoder_code -= m_range_decoder_range; + + u32 temp = 0 - (m_range_decoder_code >> 31); + + m_range_decoder_code += m_range_decoder_range & temp; + + if (m_range_decoder_code == m_range_decoder_range) + return Error::from_string_literal("Reached an invalid state while decoding LZMA stream"); + + TRY(normalize_range_decoder()); + + return temp + 1; +} + +ErrorOr<u8> LzmaDecompressor::decode_bit_with_probability(Probability& probability) +{ + // "The LZMA decoder provides the pointer to CProb variable that contains + // information about estimated probability for symbol 0 and the Range Decoder + // updates that CProb variable after decoding." + + // The significance of the shift width is not explained and appears to be a magic constant. + constexpr size_t probability_shift_width = 5; + + u32 bound = (m_range_decoder_range >> probability_bit_count) * probability; + + if (m_range_decoder_code < bound) { + probability += ((1 << probability_bit_count) - probability) >> probability_shift_width; + m_range_decoder_range = bound; + TRY(normalize_range_decoder()); + return 0; + } else { + probability -= probability >> probability_shift_width; + m_range_decoder_code -= bound; + m_range_decoder_range -= bound; + TRY(normalize_range_decoder()); + return 1; + } +} + +ErrorOr<u16> LzmaDecompressor::decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree) +{ + VERIFY(bit_count <= sizeof(u16) * 8); + VERIFY(probability_tree.size() >= 1ul << bit_count); + + // This has been modified from the reference implementation to unlink the result and the tree index, + // which should allow for better readability. + + u16 result = 0; + size_t tree_index = 1; + + for (size_t i = 0; i < bit_count; i++) { + u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index])); + result = (result << 1) | next_bit; + tree_index = (tree_index << 1) | next_bit; + } + + return result; +} + +ErrorOr<u16> LzmaDecompressor::decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree) +{ + VERIFY(bit_count <= sizeof(u16) * 8); + VERIFY(probability_tree.size() >= 1ul << bit_count); + + u16 result = 0; + size_t tree_index = 1; + + for (size_t i = 0; i < bit_count; i++) { + u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index])); + result |= next_bit << i; + tree_index = (tree_index << 1) | next_bit; + } + + return result; +} + +ErrorOr<void> LzmaDecompressor::decode_literal_to_output_buffer() +{ + u8 previous_byte = 0; + if (m_total_decoded_bytes > 0) { + auto read_bytes = MUST(m_output_buffer.read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 1)); + VERIFY(read_bytes.size() == sizeof(previous_byte)); + } + + // "To select the table for decoding it uses the context that consists of + // (lc) high bits from previous literal and (lp) low bits from value that + // represents current position in outputStream." + u16 literal_state_bits_from_position = m_total_decoded_bytes & ((1 << m_options.literal_position_bits) - 1); + u16 literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits); + u16 literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output; + + Span<Probability> selected_probability_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size); + + // The result is defined as u16 here and initialized to 1, but we will cut off the top bits before queueing them into the output buffer. + // The top bit is only used to track how much we have decoded already, and to select the correct probability table. + u16 result = 1; + + // "If (State > 7), the Literal Decoder also uses "matchByte" that represents + // the byte in OutputStream at position the is the DISTANCE bytes before + // current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair + // of latest decoded match." + // Note: The specification says `(State > 7)`, but the reference implementation does `(State >= 7)`, which is a mismatch. + // Testing `(State > 7)` with actual test files yields errors, so the reference implementation appears to be the correct one. + if (m_state >= 7) { + u8 matched_byte = 0; + auto read_bytes = TRY(m_output_buffer.read_with_seekback({ &matched_byte, sizeof(matched_byte) }, m_rep0 + 1)); + VERIFY(read_bytes.size() == sizeof(matched_byte)); + + do { + u8 match_bit = (matched_byte >> 7) & 1; + matched_byte <<= 1; + + u8 decoded_bit = TRY(decode_bit_with_probability(selected_probability_table[((1 + match_bit) << 8) + result])); + result = result << 1 | decoded_bit; + + if (match_bit != decoded_bit) + break; + } while (result < 0x100); + } + + while (result < 0x100) + result = (result << 1) | TRY(decode_bit_with_probability(selected_probability_table[result])); + + u8 actual_result = result - 0x100; + + size_t written_bytes = m_output_buffer.write({ &actual_result, sizeof(actual_result) }); + VERIFY(written_bytes == sizeof(actual_result)); + m_total_decoded_bytes += sizeof(actual_result); + + return {}; +} + +LzmaDecompressor::LzmaLengthDecoderState::LzmaLengthDecoderState() +{ + for (auto& array : m_low_length_probabilities) + initialize_to_default_probability(array); + + for (auto& array : m_medium_length_probabilities) + initialize_to_default_probability(array); + + initialize_to_default_probability(m_high_length_probabilities); +} + +ErrorOr<u16> LzmaDecompressor::decode_normalized_match_length(LzmaLengthDecoderState& length_decoder_state) +{ + // "LZMA uses "posState" value as context to select the binary tree + // from LowCoder and MidCoder binary tree arrays:" + u16 position_state = m_total_decoded_bytes & ((1 << m_options.position_bits) - 1); + + // "The following scheme is used for the match length encoding: + // + // Binary encoding Binary Tree structure Zero-based match length + // sequence (binary + decimal): + // + // 0 xxx LowCoder[posState] xxx + if (TRY(decode_bit_with_probability(length_decoder_state.m_first_choice_probability)) == 0) + return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_low_length_probabilities[position_state].span())); + + // 1 0 yyy MidCoder[posState] yyy + 8 + if (TRY(decode_bit_with_probability(length_decoder_state.m_second_choice_probability)) == 0) + return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_medium_length_probabilities[position_state].span())) + 8; + + // 1 1 zzzzzzzz HighCoder zzzzzzzz + 16" + return TRY(decode_symbol_using_bit_tree(8, length_decoder_state.m_high_length_probabilities.span())) + 16; +} + +ErrorOr<u32> LzmaDecompressor::decode_normalized_match_distance(u16 normalized_match_length) +{ + // "LZMA uses normalized match length (zero-based length) + // to calculate the context state "lenState" do decode the distance value." + u16 length_state = min(normalized_match_length, number_of_length_to_position_states - 1); + + // "At first stage the distance decoder decodes 6-bit "posSlot" value with bit + // tree decoder from PosSlotDecoder array." + u16 position_slot = TRY(decode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span())); + + // "The encoding scheme for distance value is shown in the following table: + // + // posSlot (decimal) / + // zero-based distance (binary) + // 0 0 + // 1 1 + // 2 10 + // 3 11 + // + // 4 10 x + // 5 11 x + // 6 10 xx + // 7 11 xx + // 8 10 xxx + // 9 11 xxx + // 10 10 xxxx + // 11 11 xxxx + // 12 10 xxxxx + // 13 11 xxxxx + // + // 14 10 yy zzzz + // 15 11 yy zzzz + // 16 10 yyy zzzz + // 17 11 yyy zzzz + // ... + // 62 10 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz + // 63 11 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz + // + // where + // "x ... x" means the sequence of binary symbols encoded with binary tree and + // "Reverse" scheme. It uses separated binary tree for each posSlot from 4 to 13. + // "y" means direct bit encoded with range coder. + // "zzzz" means the sequence of four binary symbols encoded with binary + // tree with "Reverse" scheme, where one common binary tree "AlignDecoder" + // is used for all posSlot values." + + // "If (posSlot < 4), the "dist" value is equal to posSlot value." + if (position_slot < first_position_slot_with_binary_tree_bits) + return position_slot; + + // From here on, the first bit of the distance is always set and the second bit is set if the last bit of the position slot is set. + u32 distance_prefix = ((1 << 1) | ((position_slot & 1) << 0)); + + // "If (posSlot >= 4), the decoder uses "posSlot" value to calculate the value of + // the high bits of "dist" value and the number of the low bits. + // If (4 <= posSlot < kEndPosModelIndex), the decoder uses bit tree decoders. + // (one separated bit tree decoder per one posSlot value) and "Reverse" scheme." + if (position_slot < first_position_slot_with_direct_encoded_bits) { + size_t number_of_bits_to_decode = (position_slot / 2) - 1; + auto& selected_probability_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits]; + return (distance_prefix << number_of_bits_to_decode) | TRY(decode_symbol_using_reverse_bit_tree(number_of_bits_to_decode, selected_probability_tree)); + } + + // " if (posSlot >= kEndPosModelIndex), the middle bits are decoded as direct + // bits from RangeDecoder and the low 4 bits are decoded with a bit tree + // decoder "AlignDecoder" with "Reverse" scheme." + size_t number_of_direct_bits_to_decode = ((position_slot - first_position_slot_with_direct_encoded_bits) / 2) + 2; + for (size_t i = 0; i < number_of_direct_bits_to_decode; i++) { + distance_prefix = (distance_prefix << 1) | TRY(decode_direct_bit()); + } + return (distance_prefix << number_of_alignment_bits) | TRY(decode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities)); +} + +ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes) +{ + while (m_output_buffer.used_space() < bytes.size() && m_output_buffer.empty_space() != 0) { + if (m_found_end_of_stream_marker) { + if (m_options.uncompressed_size.has_value() && m_total_decoded_bytes < m_options.uncompressed_size.value()) + return Error::from_string_literal("Found end-of-stream marker earlier than expected"); + + break; + } + + if (m_options.uncompressed_size.has_value() && m_total_decoded_bytes >= m_options.uncompressed_size.value()) { + // FIXME: This should validate that either EOF or the 'end of stream' marker follow immediately. + // Both of those cases count as the 'end of stream' marker being found and should check for a clean decoder state. + break; + } + + // "The decoder calculates "state2" variable value to select exact variable from + // "IsMatch" and "IsRep0Long" arrays." + u16 position_state = m_total_decoded_bytes & ((1 << m_options.position_bits) - 1); + u16 state2 = (m_state << maximum_number_of_position_bits) + position_state; + + auto update_state_after_literal = [&] { + if (m_state < 4) + m_state = 0; + else if (m_state < 10) + m_state -= 3; + else + m_state -= 6; + }; + + auto update_state_after_match = [&] { + if (m_state < 7) + m_state = 7; + else + m_state = 10; + }; + + auto update_state_after_rep = [&] { + if (m_state < 7) + m_state = 8; + else + m_state = 11; + }; + + auto update_state_after_short_rep = [&] { + if (m_state < 7) + m_state = 9; + else + m_state = 11; + }; + + auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr<void> { + VERIFY(!m_leftover_match_length.has_value()); + + if (m_options.uncompressed_size.has_value() && m_options.uncompressed_size.value() < m_total_decoded_bytes + real_length) + return Error::from_string_literal("Tried to copy match beyond expected uncompressed file size"); + + while (real_length > 0) { + if (m_output_buffer.empty_space() == 0) { + m_leftover_match_length = real_length; + break; + } + + u8 byte; + auto read_bytes = TRY(m_output_buffer.read_with_seekback({ &byte, sizeof(byte) }, m_rep0 + 1)); + VERIFY(read_bytes.size() == sizeof(byte)); + + auto written_bytes = m_output_buffer.write({ &byte, sizeof(byte) }); + VERIFY(written_bytes == sizeof(byte)); + m_total_decoded_bytes++; + + real_length--; + } + + return {}; + }; + + // If we have a leftover part of a repeating match, we should finish that first. + if (m_leftover_match_length.has_value()) { + TRY(copy_match_to_buffer(m_leftover_match_length.release_value())); + continue; + } + + // "The decoder uses the following code flow scheme to select exact + // type of LITERAL or MATCH: + // + // IsMatch[state2] decode + // 0 - the Literal" + if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) { + // "At first the LZMA decoder must check that it doesn't exceed + // specified uncompressed size." + // This is already checked for at the beginning of the loop. + + // "Then it decodes literal value and puts it to sliding window." + TRY(decode_literal_to_output_buffer()); + + // "Then the decoder must update the "state" value." + update_state_after_literal(); + continue; + } + + // " 1 - the Match + // IsRep[state] decode + // 0 - Simple Match" + if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0) { + // "The distance history table is updated with the following scheme:" + m_rep3 = m_rep2; + m_rep2 = m_rep1; + m_rep1 = m_rep0; + + // "The zero-based length is decoded with "LenDecoder"." + u16 normalized_length = TRY(decode_normalized_match_length(m_length_decoder)); + + // "The state is update with UpdateState_Match function." + update_state_after_match(); + + // "and the new "rep0" value is decoded with DecodeDistance." + m_rep0 = TRY(decode_normalized_match_distance(normalized_length)); + + // "If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have + // "End of stream" marker, so we can stop decoding and check finishing + // condition in Range Decoder" + if (m_rep0 == 0xFFFFFFFF) { + // The range decoder condition is checked after breaking out of the loop. + m_found_end_of_stream_marker = true; + continue; + } + + // "If uncompressed size is defined, LZMA decoder must check that it doesn't + // exceed that specified uncompressed size." + // This is being checked for in the common "copy to buffer" implementation. + + // "Also the decoder must check that "rep0" value is not larger than dictionary size + // and is not larger than the number of already decoded bytes." + if (m_rep0 > min(m_options.dictionary_size, m_total_decoded_bytes)) + return Error::from_string_literal("rep0 value is larger than the possible lookback size"); + + // "Then the decoder must copy match bytes as described in + // "The match symbols copying" section." + TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset)); + + continue; + } + + // " 1 - Rep Match + // IsRepG0[state] decode + // 0 - the distance is rep0" + if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) { + // "LZMA doesn't update the distance history." + + // " IsRep0Long[state2] decode + // 0 - Short Rep Match" + if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0) { + // "If the subtype is "Short Rep Match", the decoder updates the state, puts + // the one byte from window to current position in window and goes to next + // MATCH/LITERAL symbol." + update_state_after_short_rep(); + + u8 byte; + auto read_bytes = TRY(m_output_buffer.read_with_seekback({ &byte, sizeof(byte) }, m_rep0 + 1)); + VERIFY(read_bytes.size() == sizeof(byte)); + + auto written_bytes = m_output_buffer.write({ &byte, sizeof(byte) }); + VERIFY(written_bytes == sizeof(byte)); + m_total_decoded_bytes++; + + continue; + } + // " 1 - Rep Match 0" + // Intentional fallthrough, we just need to make sure to not run the detection for other match types and to not switch around the distance history. + } else { + // " 1 - + // IsRepG1[state] decode + // 0 - Rep Match 1" + if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0) { + u32 distance = m_rep1; + m_rep1 = m_rep0; + m_rep0 = distance; + } + + // " 1 - + // IsRepG2[state] decode + // 0 - Rep Match 2" + else if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0) { + u32 distance = m_rep2; + m_rep2 = m_rep1; + m_rep1 = m_rep0; + m_rep0 = distance; + } + + // " 1 - Rep Match 3" + else { + u32 distance = m_rep3; + m_rep3 = m_rep2; + m_rep2 = m_rep1; + m_rep1 = m_rep0; + m_rep0 = distance; + } + } + + // "In other cases (Rep Match 0/1/2/3), it decodes the zero-based + // length of match with "RepLenDecoder" decoder." + u16 normalized_length = TRY(decode_normalized_match_length(m_rep_length_decoder)); + + // "Then it updates the state." + update_state_after_rep(); + + // "Then the decoder must copy match bytes as described in + // "The Match symbols copying" section." + TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset)); + } + + if (m_found_end_of_stream_marker) { + if (m_range_decoder_code != 0) + return Error::from_string_literal("LZMA stream ends in an unclean state"); + } + + return m_output_buffer.read(bytes); +} + +ErrorOr<size_t> LzmaDecompressor::write_some(ReadonlyBytes) +{ + return Error::from_errno(EBADF); +} + +bool LzmaDecompressor::is_eof() const +{ + if (m_output_buffer.used_space() > 0) + return false; + + if (m_options.uncompressed_size.has_value()) + return m_total_decoded_bytes >= m_options.uncompressed_size.value(); + + return m_found_end_of_stream_marker; +} + +bool LzmaDecompressor::is_open() const +{ + return true; +} + +void LzmaDecompressor::close() +{ +} + +} diff --git a/Userland/Libraries/LibCompress/Lzma.h b/Userland/Libraries/LibCompress/Lzma.h new file mode 100644 index 0000000000..ff9b83afd9 --- /dev/null +++ b/Userland/Libraries/LibCompress/Lzma.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2023, Tim Schumacher <timschumi@gmx.de> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include <AK/CircularBuffer.h> +#include <AK/FixedArray.h> +#include <AK/MaybeOwned.h> +#include <AK/NonnullOwnPtr.h> +#include <AK/Stream.h> + +namespace Compress { + +// This implementation is mostly based on the LZMA specification contained in the 7-Zip SDK, which has been placed in the public domain. +// LZMA Specification Draft (2015): https://www.7-zip.org/a/lzma-specification.7z + +struct LzmaDecompressorOptions { + u8 literal_context_bits { 0 }; + u8 literal_position_bits { 0 }; + u8 position_bits { 0 }; + u32 dictionary_size { 0 }; + Optional<u64> uncompressed_size; +}; + +// Described in section "lzma file format". +struct [[gnu::packed]] LzmaHeader { + u32 dictionary_size() const; + Optional<u64> uncompressed_size() const; + + ErrorOr<LzmaDecompressorOptions> as_decompressor_options() const; + +private: + ErrorOr<void> decode_model_properties(u8& literal_context_bits, u8& literal_pos_bits, u8& pos_bits) const; + + u8 m_model_properties; + u32 m_dictionary_size; + u64 m_uncompressed_size; +}; +static_assert(sizeof(LzmaHeader) == 13); + +class LzmaDecompressor : public Stream { +public: + /// Creates a decompressor from a standalone LZMA container (.lzma file extension, occasionally known as an LZMA 'archive'). + static ErrorOr<NonnullOwnPtr<LzmaDecompressor>> create_from_container(MaybeOwned<Stream>); + + /// Creates a decompressor from a raw stream of LZMA-compressed data (found inside an LZMA container or embedded in other file formats). + static ErrorOr<NonnullOwnPtr<LzmaDecompressor>> create_from_raw_stream(MaybeOwned<Stream>, LzmaDecompressorOptions const&); + + virtual ErrorOr<Bytes> read_some(Bytes) override; + virtual ErrorOr<size_t> write_some(ReadonlyBytes) override; + virtual bool is_eof() const override; + virtual bool is_open() const override; + virtual void close() override; + +private: + // LZMA uses 11-bit probability counters, but they are usually stored in 16-bit variables. + // Therefore, we can model probabilities with a resolution of up to 1 / 2^11 (which is equal to 1 / 2048). + // The default probability for most counters is 0.5. + using Probability = u16; + static constexpr size_t probability_bit_count = 11; + static constexpr Probability default_probability = (1 << probability_bit_count) / 2; + static void initialize_to_default_probability(Span<Probability>); + + LzmaDecompressor(MaybeOwned<Stream>, LzmaDecompressorOptions, CircularBuffer, FixedArray<Probability> literal_probabilities); + + MaybeOwned<Stream> m_stream; + LzmaDecompressorOptions m_options; + + CircularBuffer m_output_buffer; + u64 m_total_decoded_bytes { 0 }; + bool m_found_end_of_stream_marker { false }; + Optional<u16> m_leftover_match_length; + + // Range decoder state (initialized with stream data in LzmaDecompressor::create). + u32 m_range_decoder_range { 0xFFFFFFFF }; + u32 m_range_decoder_code { 0 }; + + ErrorOr<void> normalize_range_decoder(); + ErrorOr<u8> decode_direct_bit(); + ErrorOr<u8> decode_bit_with_probability(Probability& probability); + + // Decodes a multi-bit symbol using a given probability tree (either in normal or in reverse order). + // The specification states that "unsigned" is at least 16 bits in size, our implementation assumes this as the maximum symbol size. + ErrorOr<u16> decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree); + ErrorOr<u16> decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree); + + ErrorOr<void> decode_literal_to_output_buffer(); + static constexpr size_t literal_probability_table_size = 0x300; + FixedArray<Probability> m_literal_probabilities; + + struct LzmaLengthDecoderState { + public: + LzmaLengthDecoderState(); + + Probability m_first_choice_probability { default_probability }; + Probability m_second_choice_probability { default_probability }; + + static constexpr size_t maximum_number_of_position_bits = 4; + Array<Array<Probability, (1 << 3)>, (1 << maximum_number_of_position_bits)> m_low_length_probabilities; + Array<Array<Probability, (1 << 3)>, (1 << maximum_number_of_position_bits)> m_medium_length_probabilities; + Array<Probability, (1 << 8)> m_high_length_probabilities; + }; + + LzmaLengthDecoderState m_length_decoder; + LzmaLengthDecoderState m_rep_length_decoder; + static constexpr u16 normalized_to_real_match_length_offset = 2; + ErrorOr<u16> decode_normalized_match_length(LzmaLengthDecoderState&); + + static constexpr size_t number_of_length_to_position_states = 4; + Array<Array<Probability, (1 << 6)>, number_of_length_to_position_states> m_length_to_position_states; + + static constexpr size_t first_position_slot_with_binary_tree_bits = 4; + static constexpr size_t first_position_slot_with_direct_encoded_bits = 14; + + // This is a bit wasteful on memory and not in the specification, but it makes the math easier. + static constexpr size_t number_of_binary_tree_distance_slots = first_position_slot_with_direct_encoded_bits - first_position_slot_with_binary_tree_bits; + static constexpr size_t largest_number_of_binary_tree_distance_bits = 5; + Array<Array<Probability, (1 << largest_number_of_binary_tree_distance_bits)>, number_of_binary_tree_distance_slots> m_binary_tree_distance_probabilities; + + static constexpr size_t number_of_alignment_bits = 4; + Array<Probability, (1 << number_of_alignment_bits)> m_alignment_bit_probabilities; + + // This deviates from the specification, which states that "unsigned" is at least 16-bit. + // However, the match distance needs to be at least 32-bit, at the very least to hold the 0xFFFFFFFF end marker value. + static constexpr u32 normalized_to_real_match_distance_offset = 1; + ErrorOr<u32> decode_normalized_match_distance(u16 normalized_match_length); + + // LZ state tracking. + u16 m_state { 0 }; + u32 m_rep0 { 0 }; + u32 m_rep1 { 0 }; + u32 m_rep2 { 0 }; + u32 m_rep3 { 0 }; + + static constexpr size_t maximum_number_of_position_bits = 4; + static constexpr size_t number_of_states = 12; + Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_match_probabilities; + Array<Probability, number_of_states> m_is_rep_probabilities; + Array<Probability, number_of_states> m_is_rep_g0_probabilities; + Array<Probability, number_of_states> m_is_rep_g1_probabilities; + Array<Probability, number_of_states> m_is_rep_g2_probabilities; + Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_rep0_long_probabilities; +}; + +} + +template<> +struct AK::Traits<Compress::LzmaHeader> : public AK::GenericTraits<Compress::LzmaHeader> { + static constexpr bool is_trivially_serializable() { return true; } +}; |