summaryrefslogtreecommitdiff
path: root/Userland/Libraries/LibCompress/Lzma.h
blob: af9426a38ce16fd3519d30ae5b20e027aad50765 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/*
 * Copyright (c) 2023, Tim Schumacher <timschumi@gmx.de>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#pragma once

#include <AK/CircularBuffer.h>
#include <AK/FixedArray.h>
#include <AK/MaybeOwned.h>
#include <AK/NonnullOwnPtr.h>
#include <AK/Stream.h>

namespace Compress {

// This implementation is mostly based on the LZMA specification contained in the 7-Zip SDK, which has been placed in the public domain.
// LZMA Specification Draft (2015): https://www.7-zip.org/a/lzma-specification.7z

struct LzmaModelProperties {
    u8 literal_context_bits;
    u8 literal_position_bits;
    u8 position_bits;
};

struct LzmaDecompressorOptions {
    u8 literal_context_bits { 0 };
    u8 literal_position_bits { 0 };
    u8 position_bits { 0 };
    u32 dictionary_size { 0 };
    Optional<u64> uncompressed_size;
    bool reject_end_of_stream_marker { false };
};

// Described in section "lzma file format".
struct [[gnu::packed]] LzmaHeader {
    u32 dictionary_size() const;
    Optional<u64> uncompressed_size() const;

    ErrorOr<LzmaDecompressorOptions> as_decompressor_options() const;

    static ErrorOr<LzmaModelProperties> decode_model_properties(u8 input_bits);

private:
    u8 m_encoded_model_properties;
    u32 m_dictionary_size;
    u64 m_uncompressed_size;
};
static_assert(sizeof(LzmaHeader) == 13);

class LzmaDecompressor : public Stream {
public:
    /// Creates a decompressor from a standalone LZMA container (.lzma file extension, occasionally known as an LZMA 'archive').
    static ErrorOr<NonnullOwnPtr<LzmaDecompressor>> create_from_container(MaybeOwned<Stream>, Optional<MaybeOwned<CircularBuffer>> dictionary = {});

    /// Creates a decompressor from a raw stream of LZMA-compressed data (found inside an LZMA container or embedded in other file formats).
    static ErrorOr<NonnullOwnPtr<LzmaDecompressor>> create_from_raw_stream(MaybeOwned<Stream>, LzmaDecompressorOptions const&, Optional<MaybeOwned<CircularBuffer>> dictionary = {});

    ErrorOr<void> append_input_stream(MaybeOwned<Stream>, Optional<u64> uncompressed_size);

    virtual ErrorOr<Bytes> read_some(Bytes) override;
    virtual ErrorOr<size_t> write_some(ReadonlyBytes) override;
    virtual bool is_eof() const override;
    virtual bool is_open() const override;
    virtual void close() override;

private:
    // LZMA uses 11-bit probability counters, but they are usually stored in 16-bit variables.
    // Therefore, we can model probabilities with a resolution of up to 1 / 2^11 (which is equal to 1 / 2048).
    // The default probability for most counters is 0.5.
    using Probability = u16;
    static constexpr size_t probability_bit_count = 11;
    static constexpr Probability default_probability = (1 << probability_bit_count) / 2;
    static void initialize_to_default_probability(Span<Probability>);

    LzmaDecompressor(MaybeOwned<Stream>, LzmaDecompressorOptions, MaybeOwned<CircularBuffer>, FixedArray<Probability> literal_probabilities);

    MaybeOwned<Stream> m_stream;
    LzmaDecompressorOptions m_options;

    // This doubles as an output buffer, since we have to write all of our results into this anyways.
    MaybeOwned<CircularBuffer> m_dictionary;
    u64 m_total_decoded_bytes { 0 };
    bool m_found_end_of_stream_marker { false };
    bool is_range_decoder_in_clean_state() const;
    bool has_reached_expected_data_size() const;
    Optional<u16> m_leftover_match_length;

    // Range decoder state (initialized with stream data in LzmaDecompressor::create).
    u32 m_range_decoder_range { 0xFFFFFFFF };
    u32 m_range_decoder_code { 0 };

    ErrorOr<void> initialize_range_decoder();
    ErrorOr<void> normalize_range_decoder();
    ErrorOr<u8> decode_direct_bit();
    ErrorOr<u8> decode_bit_with_probability(Probability& probability);

    // Decodes a multi-bit symbol using a given probability tree (either in normal or in reverse order).
    // The specification states that "unsigned" is at least 16 bits in size, our implementation assumes this as the maximum symbol size.
    ErrorOr<u16> decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree);
    ErrorOr<u16> decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree);

    ErrorOr<void> decode_literal_to_output_buffer();
    static constexpr size_t literal_probability_table_size = 0x300;
    FixedArray<Probability> m_literal_probabilities;

    struct LzmaLengthDecoderState {
    public:
        LzmaLengthDecoderState();

        Probability m_first_choice_probability { default_probability };
        Probability m_second_choice_probability { default_probability };

        static constexpr size_t maximum_number_of_position_bits = 4;
        Array<Array<Probability, (1 << 3)>, (1 << maximum_number_of_position_bits)> m_low_length_probabilities;
        Array<Array<Probability, (1 << 3)>, (1 << maximum_number_of_position_bits)> m_medium_length_probabilities;
        Array<Probability, (1 << 8)> m_high_length_probabilities;
    };

    LzmaLengthDecoderState m_length_decoder;
    LzmaLengthDecoderState m_rep_length_decoder;
    static constexpr u16 normalized_to_real_match_length_offset = 2;
    ErrorOr<u16> decode_normalized_match_length(LzmaLengthDecoderState&);

    static constexpr size_t number_of_length_to_position_states = 4;
    Array<Array<Probability, (1 << 6)>, number_of_length_to_position_states> m_length_to_position_states;

    static constexpr size_t first_position_slot_with_binary_tree_bits = 4;
    static constexpr size_t first_position_slot_with_direct_encoded_bits = 14;

    // This is a bit wasteful on memory and not in the specification, but it makes the math easier.
    static constexpr size_t number_of_binary_tree_distance_slots = first_position_slot_with_direct_encoded_bits - first_position_slot_with_binary_tree_bits;
    static constexpr size_t largest_number_of_binary_tree_distance_bits = 5;
    Array<Array<Probability, (1 << largest_number_of_binary_tree_distance_bits)>, number_of_binary_tree_distance_slots> m_binary_tree_distance_probabilities;

    static constexpr size_t number_of_alignment_bits = 4;
    Array<Probability, (1 << number_of_alignment_bits)> m_alignment_bit_probabilities;

    // This deviates from the specification, which states that "unsigned" is at least 16-bit.
    // However, the match distance needs to be at least 32-bit, at the very least to hold the 0xFFFFFFFF end marker value.
    static constexpr u32 normalized_to_real_match_distance_offset = 1;
    ErrorOr<u32> decode_normalized_match_distance(u16 normalized_match_length);

    // LZ state tracking.
    u16 m_state { 0 };
    u32 m_rep0 { 0 };
    u32 m_rep1 { 0 };
    u32 m_rep2 { 0 };
    u32 m_rep3 { 0 };
    u32 current_repetition_offset() const;

    static constexpr size_t maximum_number_of_position_bits = 4;
    static constexpr size_t number_of_states = 12;
    Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_match_probabilities;
    Array<Probability, number_of_states> m_is_rep_probabilities;
    Array<Probability, number_of_states> m_is_rep_g0_probabilities;
    Array<Probability, number_of_states> m_is_rep_g1_probabilities;
    Array<Probability, number_of_states> m_is_rep_g2_probabilities;
    Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_rep0_long_probabilities;
};

}

template<>
struct AK::Traits<Compress::LzmaHeader> : public AK::GenericTraits<Compress::LzmaHeader> {
    static constexpr bool is_trivially_serializable() { return true; }
};