/* * Copyright (c) 2023, Tim Schumacher * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include namespace Compress { ErrorOr XzMultibyteInteger::read_from_stream(Stream& stream) { // 1.2. Multibyte Integers: // "When smaller values are more likely than bigger values (for // example file sizes), multibyte integers are encoded in a // variable-length representation: // - Numbers in the range [0, 127] are copied as is, and take // one byte of space. // - Bigger numbers will occupy two or more bytes. All but the // last byte of the multibyte representation have the highest // (eighth) bit set." // 9 * 7 bits is 63 bits, which is the largest that will fit into an u64. constexpr size_t maximum_number_of_bytes = 9; u64 result = 0; for (size_t i = 0; i < maximum_number_of_bytes; i++) { u64 const next_byte = TRY(stream.read_value()); result |= (next_byte & 0x7F) << (i * 7); // We should reject numbers that are encoded in too many bytes. if (next_byte == 0x00 && i != 0) return Error::from_string_literal("XZ multibyte integer has a larger encoding than necessary"); if ((next_byte & 0x80) == 0) break; } return XzMultibyteInteger { result }; } ErrorOr XzStreamHeader::validate() const { // 2.1.1.1. Header Magic Bytes: // "The first six (6) bytes of the Stream are so called Header // Magic Bytes. They can be used to identify the file type. // // Using a C array and ASCII: // const uint8_t HEADER_MAGIC[6] // = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; // // In plain hexadecimal: // FD 37 7A 58 5A 00 // // If the Header Magic Bytes don't match, the decoder MUST // indicate an error." if (magic[0] != 0xFD || magic[1] != '7' || magic[2] != 'z' || magic[3] != 'X' || magic[4] != 'Z' || magic[5] != 0x00) return Error::from_string_literal("XZ stream header has an invalid magic"); // 2.1.1.2. Stream Flags: // "If any reserved bit is set, the decoder MUST indicate an error. // It is possible that there is a new field present which the // decoder is not aware of, and can thus parse the Stream Header // incorrectly." if (flags.reserved != 0 || flags.reserved_bits != 0) return Error::from_string_literal("XZ stream header has reserved non-null stream flag bits"); // 2.1.1.3. CRC32: // "The CRC32 is calculated from the Stream Flags field. It is // stored as an unsigned 32-bit little endian integer. If the // calculated value does not match the stored one, the decoder // MUST indicate an error." if (Crypto::Checksum::CRC32({ &flags, sizeof(flags) }).digest() != flags_crc32) return Error::from_string_literal("XZ stream header has an invalid CRC32 checksum"); return {}; } ErrorOr XzStreamFooter::validate() const { // 2.1.2.1. CRC32: // "The CRC32 is calculated from the Backward Size and Stream Flags // fields. It is stored as an unsigned 32-bit little endian // integer. If the calculated value does not match the stored one, // the decoder MUST indicate an error." Crypto::Checksum::CRC32 calculated_crc32; calculated_crc32.update({ &encoded_backward_size, sizeof(encoded_backward_size) }); calculated_crc32.update({ &flags, sizeof(flags) }); if (calculated_crc32.digest() != size_and_flags_crc32) return Error::from_string_literal("XZ stream footer has an invalid CRC32 checksum"); // 2.1.2.4. Footer Magic Bytes: // "As the last step of the decoding process, the decoder MUST // verify the existence of Footer Magic Bytes. If they don't // match, an error MUST be indicated. // // Using a C array and ASCII: // const uint8_t FOOTER_MAGIC[2] = { 'Y', 'Z' }; // // In hexadecimal: // 59 5A" if (magic[0] != 'Y' || magic[1] != 'Z') return Error::from_string_literal("XZ stream footer has an invalid magic"); return {}; } u32 XzStreamFooter::backward_size() const { // 2.1.2.2. Backward Size: // "Backward Size is stored as a 32-bit little endian integer, // which indicates the size of the Index field as multiple of // four bytes, minimum value being four bytes: // // real_backward_size = (stored_backward_size + 1) * 4;" return (encoded_backward_size + 1) * 4; } u8 XzBlockFlags::number_of_filters() const { // 3.1.2. Block Flags: // "Bit(s) Mask Description // 0-1 0x03 Number of filters (1-4)" return encoded_number_of_filters + 1; } ErrorOr XzFilterLzma2Properties::validate() const { // 5.3.1. LZMA2: // "Bits Mask Description // 6-7 0xC0 Reserved for future use; MUST be zero for now." if (reserved != 0) return Error::from_string_literal("XZ LZMA2 filter properties contains non-null reserved bits"); // " const uint8_t bits = get_dictionary_flags() & 0x3F; // if (bits > 40) // return DICTIONARY_TOO_BIG; // Bigger than 4 GiB" if (encoded_dictionary_size > 40) return Error::from_string_literal("XZ LZMA2 filter properties contains larger-than-allowed dictionary size"); return {}; } u32 XzFilterLzma2Properties::dictionary_size() const { // "Dictionary Size is encoded with one-bit mantissa and five-bit // exponent. The smallest dictionary size is 4 KiB and the biggest // is 4 GiB. // Instead of having a table in the decoder, the dictionary size // can be decoded using the following C code:" if (encoded_dictionary_size == 40) return NumericLimits::max(); u32 dictionary_size = 2 | (encoded_dictionary_size & 1); dictionary_size <<= encoded_dictionary_size / 2 + 11; return dictionary_size; } ErrorOr> XzDecompressor::create(MaybeOwned stream) { auto counting_stream = TRY(try_make(move(stream))); auto decompressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) XzDecompressor(move(counting_stream)))); return decompressor; } XzDecompressor::XzDecompressor(NonnullOwnPtr stream) : m_stream(move(stream)) { } static Optional size_for_check_type(XzStreamCheckType check_type) { switch (check_type) { case XzStreamCheckType::None: return 0; case XzStreamCheckType::CRC32: return 4; case XzStreamCheckType::CRC64: return 8; case XzStreamCheckType::SHA256: return 32; default: return {}; } } ErrorOr XzDecompressor::load_next_stream() { // If we already determined to have found the last stream footer, there is nothing more to do. if (m_found_last_stream_footer) return false; // This assumes that we can just read the Stream Header into memory as-is. Check that this still holds up for good measure. static_assert(AK::Traits::is_trivially_serializable()); XzStreamHeader stream_header {}; Bytes stream_header_bytes { &stream_header, sizeof(stream_header) }; if (m_found_first_stream_header) { // 2.2. Stream Padding: // "Stream Padding MUST contain only null bytes. To preserve the // four-byte alignment of consecutive Streams, the size of Stream // Padding MUST be a multiple of four bytes. Empty Stream Padding // is allowed. If these requirements are not met, the decoder MUST // indicate an error." VERIFY(m_stream->read_bytes() % 4 == 0); while (true) { // Read the first byte until we either get a non-null byte or reach EOF. auto byte_or_error = m_stream->read_value(); if (byte_or_error.is_error() && m_stream->is_eof()) break; auto byte = TRY(byte_or_error); if (byte != 0) { stream_header_bytes[0] = byte; stream_header_bytes = stream_header_bytes.slice(1); break; } } // If we aren't at EOF we already read the potential first byte of the header, so we need to subtract that. auto end_of_padding_offset = m_stream->read_bytes(); if (!m_stream->is_eof()) end_of_padding_offset -= 1; if (end_of_padding_offset % 4 != 0) return Error::from_string_literal("XZ Stream Padding is not aligned to 4 bytes"); if (m_stream->is_eof()) { m_found_last_stream_footer = true; return false; } } TRY(m_stream->read_until_filled(stream_header_bytes)); TRY(stream_header.validate()); m_stream_flags = stream_header.flags; m_found_first_stream_header = true; return true; } ErrorOr XzDecompressor::load_next_block(u8 encoded_block_header_size) { // We already read the encoded Block Header size (one byte) to determine that this is not an Index. m_current_block_start_offset = m_stream->read_bytes() - 1; // Ensure that the start of the block is aligned to a multiple of four (in theory, everything in XZ is). VERIFY(m_current_block_start_offset % 4 == 0); // 3.1.1. Block Header Size: // "This field contains the size of the Block Header field, // including the Block Header Size field itself. Valid values are // in the range [0x01, 0xFF], which indicate the size of the Block // Header as multiples of four bytes, minimum size being eight // bytes: // // real_header_size = (encoded_header_size + 1) * 4;" u64 const block_header_size = (encoded_block_header_size + 1) * 4; // Read the whole header into a buffer to allow calculating the CRC32 later (3.1.7. CRC32). auto header = TRY(ByteBuffer::create_uninitialized(block_header_size)); header[0] = encoded_block_header_size; TRY(m_stream->read_until_filled(header.span().slice(1))); FixedMemoryStream header_stream { header.span().slice(1) }; // 3.1.2. Block Flags: // "If any reserved bit is set, the decoder MUST indicate an error. // It is possible that there is a new field present which the // decoder is not aware of, and can thus parse the Block Header // incorrectly." auto const flags = TRY(header_stream.read_value()); if (flags.reserved != 0) return Error::from_string_literal("XZ block header has reserved non-null block flag bits"); MaybeOwned new_block_stream { *m_stream }; // 3.1.3. Compressed Size: // "This field is present only if the appropriate bit is set in // the Block Flags field (see Section 3.1.2)." if (flags.compressed_size_present) { // "Compressed Size is stored using the encoding described in Section 1.2." u64 const compressed_size = TRY(header_stream.read_value()); // "The Compressed Size field contains the size of the Compressed // Data field, which MUST be non-zero." if (compressed_size == 0) return Error::from_string_literal("XZ block header contains a compressed size of zero"); new_block_stream = TRY(try_make(move(new_block_stream), compressed_size)); } // 3.1.4. Uncompressed Size: // "This field is present only if the appropriate bit is set in // the Block Flags field (see Section 3.1.2)." if (flags.uncompressed_size_present) { // "Uncompressed Size is stored using the encoding described in Section 1.2." u64 const uncompressed_size = TRY(header_stream.read_value()); m_current_block_expected_uncompressed_size = uncompressed_size; } else { m_current_block_expected_uncompressed_size.clear(); } // 3.1.5. List of Filter Flags: // "The number of Filter Flags fields is stored in the Block Flags // field (see Section 3.1.2)." for (size_t i = 0; i < flags.number_of_filters(); i++) { // "The format of each Filter Flags field is as follows: // Both Filter ID and Size of Properties are stored using the // encoding described in Section 1.2." u64 const filter_id = TRY(header_stream.read_value()); u64 const size_of_properties = TRY(header_stream.read_value()); // "Size of Properties indicates the size of the Filter Properties field as bytes." auto filter_properties = TRY(ByteBuffer::create_uninitialized(size_of_properties)); TRY(header_stream.read_until_filled(filter_properties)); // 5.3.1. LZMA2 if (filter_id == 0x21) { if (size_of_properties < sizeof(XzFilterLzma2Properties)) return Error::from_string_literal("XZ LZMA2 filter has a smaller-than-needed properties size"); auto const* properties = reinterpret_cast(filter_properties.data()); TRY(properties->validate()); new_block_stream = TRY(Lzma2Decompressor::create_from_raw_stream(move(new_block_stream), properties->dictionary_size())); continue; } return Error::from_string_literal("XZ block header contains unknown filter ID"); } // 3.1.6. Header Padding: // "This field contains as many null byte as it is needed to make // the Block Header have the size specified in Block Header Size." constexpr size_t size_of_block_header_size = 1; constexpr size_t size_of_crc32 = 4; while (MUST(header_stream.tell()) < block_header_size - size_of_block_header_size - size_of_crc32) { auto const padding_byte = TRY(header_stream.read_value()); // "If any of the bytes are not null bytes, the decoder MUST // indicate an error." if (padding_byte != 0) return Error::from_string_literal("XZ block header padding contains non-null bytes"); } // 3.1.7. CRC32: // "The CRC32 is calculated over everything in the Block Header // field except the CRC32 field itself. Crypto::Checksum::CRC32 calculated_header_crc32 { header.span().trim(block_header_size - size_of_crc32) }; // It is stored as an unsigned 32-bit little endian integer. u32 const stored_header_crc32 = TRY(header_stream.read_value>()); // If the calculated value does not match the stored one, the decoder MUST indicate // an error." if (calculated_header_crc32.digest() != stored_header_crc32) return Error::from_string_literal("Stored XZ block header CRC32 does not match the stored CRC32"); m_current_block_stream = move(new_block_stream); m_current_block_uncompressed_size = 0; return {}; } ErrorOr XzDecompressor::finish_current_block() { auto unpadded_size = m_stream->read_bytes() - m_current_block_start_offset; // 3.3. Block Padding: // "Block Padding MUST contain 0-3 null bytes to make the size of // the Block a multiple of four bytes. This can be needed when // the size of Compressed Data is not a multiple of four." for (size_t i = 0; (unpadded_size + i) % 4 != 0; i++) { auto const padding_byte = TRY(m_stream->read_value()); // "If any of the bytes in Block Padding are not null bytes, the decoder // MUST indicate an error." if (padding_byte != 0) return Error::from_string_literal("XZ block contains a non-null padding byte"); } // 3.4. Check: // "The type and size of the Check field depends on which bits // are set in the Stream Flags field (see Section 2.1.1.2). // // The Check, when used, is calculated from the original // uncompressed data. If the calculated Check does not match the // stored one, the decoder MUST indicate an error. If the selected // type of Check is not supported by the decoder, it SHOULD // indicate a warning or error." auto const maybe_check_size = size_for_check_type(m_stream_flags->check_type); if (!maybe_check_size.has_value()) return Error::from_string_literal("XZ stream has an unknown check type"); // TODO: Block content checks are currently unimplemented as a whole, independent of the check type. // For now, we only make sure to remove the correct amount of bytes from the stream. TRY(m_stream->discard(*maybe_check_size)); unpadded_size += *maybe_check_size; if (m_current_block_expected_uncompressed_size.has_value()) { if (*m_current_block_expected_uncompressed_size != m_current_block_uncompressed_size) return Error::from_string_literal("Uncompressed size of XZ block does not match the expected value"); } TRY(m_processed_blocks.try_append({ .uncompressed_size = m_current_block_uncompressed_size, .unpadded_size = unpadded_size, })); return {}; } ErrorOr XzDecompressor::finish_current_stream() { // We already read the Index Indicator (one byte) to determine that this is an Index. auto const start_of_current_block = m_stream->read_bytes() - 1; // 4.2. Number of Records: // "This field indicates how many Records there are in the List // of Records field, and thus how many Blocks there are in the // Stream. The value is stored using the encoding described in // Section 1.2." u64 const number_of_records = TRY(m_stream->read_value()); if (m_processed_blocks.size() != number_of_records) return Error::from_string_literal("Number of Records in XZ Index does not match the number of processed Blocks"); // 4.3. List of Records: // "List of Records consists of as many Records as indicated by the // Number of Records field:" for (u64 i = 0; i < number_of_records; i++) { // "Each Record contains information about one Block: // // +===============+===================+ // | Unpadded Size | Uncompressed Size | // +===============+===================+" // 4.3.1. Unpadded Size: // "This field indicates the size of the Block excluding the Block // Padding field. That is, Unpadded Size is the size of the Block // Header, Compressed Data, and Check fields. Unpadded Size is // stored using the encoding described in Section 1.2." u64 const unpadded_size = TRY(m_stream->read_value()); // "The value MUST never be zero; with the current structure of Blocks, the // actual minimum value for Unpadded Size is five." if (unpadded_size < 5) return Error::from_string_literal("XZ index contains a record with an unpadded size of less than five"); // 4.3.2. Uncompressed Size: // "This field indicates the Uncompressed Size of the respective // Block as bytes. The value is stored using the encoding // described in Section 1.2." u64 const uncompressed_size = TRY(m_stream->read_value()); // 4.3. List of Records: // "If the decoder has decoded all the Blocks of the Stream, it // MUST verify that the contents of the Records match the real // Unpadded Size and Uncompressed Size of the respective Blocks." if (m_processed_blocks[i].uncompressed_size != uncompressed_size) return Error::from_string_literal("Uncompressed size of XZ Block does not match the Index"); if (m_processed_blocks[i].unpadded_size != unpadded_size) return Error::from_string_literal("Unpadded size of XZ Block does not match the Index"); } // 4.4. Index Padding: // "This field MUST contain 0-3 null bytes to pad the Index to // a multiple of four bytes. If any of the bytes are not null // bytes, the decoder MUST indicate an error." while ((m_stream->read_bytes() - start_of_current_block) % 4 != 0) { auto padding_byte = TRY(m_stream->read_value()); if (padding_byte != 0) return Error::from_string_literal("XZ index contains a non-null padding byte"); } // 4.5. CRC32: // "The CRC32 is calculated over everything in the Index field // except the CRC32 field itself. The CRC32 is stored as an // unsigned 32-bit little endian integer." u32 const index_crc32 = TRY(m_stream->read_value>()); // "If the calculated value does not match the stored one, the decoder MUST indicate // an error." // TODO: Validation of the index CRC32 is currently unimplemented. (void)index_crc32; auto const size_of_index = m_stream->read_bytes() - start_of_current_block; // According to the specification of a stream (2.1. Stream), the index is the last element in a stream, // followed by the stream footer (2.1.2. Stream Footer). auto const stream_footer = TRY(m_stream->read_value()); // This handles verifying the CRC32 (2.1.2.1. CRC32) and the magic bytes (2.1.2.4. Footer Magic Bytes). TRY(stream_footer.validate()); // 2.1.2.2. Backward Size: // "If the stored value does not match the real size of the Index // field, the decoder MUST indicate an error." if (stream_footer.backward_size() != size_of_index) return Error::from_string_literal("XZ index size does not match the stored size in the stream footer"); // 2.1.2.3. Stream Flags: // "This is a copy of the Stream Flags field from the Stream // Header. The information stored to Stream Flags is needed // when parsing the Stream backwards. The decoder MUST compare // the Stream Flags fields in both Stream Header and Stream // Footer, and indicate an error if they are not identical." if (ReadonlyBytes { &*m_stream_flags, sizeof(XzStreamFlags) } != ReadonlyBytes { &stream_footer.flags, sizeof(stream_footer.flags) }) return Error::from_string_literal("XZ stream header flags don't match the stream footer"); return {}; } ErrorOr XzDecompressor::read_some(Bytes bytes) { if (!m_stream_flags.has_value()) { if (!TRY(load_next_stream())) return bytes.trim(0); } if (!m_current_block_stream.has_value() || (*m_current_block_stream)->is_eof()) { if (m_current_block_stream.has_value()) { // We have already processed a block, so we weed to clean up trailing data before the next block starts. TRY(finish_current_block()); } // The first byte between Block Header (3.1.1. Block Header Size) and Index (4.1. Index Indicator) overlap. // Block header sizes have valid values in the range of [0x01, 0xFF], the only valid value for an Index Indicator is therefore 0x00. auto const encoded_block_header_size_or_index_indicator = TRY(m_stream->read_value()); if (encoded_block_header_size_or_index_indicator == 0x00) { // This is an Index, which is the last element before the stream footer. TRY(finish_current_stream()); // Another XZ Stream might follow, so we just unset the current information and continue on the next read. m_stream_flags.clear(); m_processed_blocks.clear(); return bytes.trim(0); } TRY(load_next_block(encoded_block_header_size_or_index_indicator)); } auto result = TRY((*m_current_block_stream)->read_some(bytes)); m_current_block_uncompressed_size += result.size(); return result; } ErrorOr XzDecompressor::write_some(ReadonlyBytes) { return Error::from_errno(EBADF); } bool XzDecompressor::is_eof() const { return m_found_last_stream_footer; } bool XzDecompressor::is_open() const { return true; } void XzDecompressor::close() { } }