summaryrefslogtreecommitdiff
path: root/Userland/Libraries/LibPDF/DocumentParser.h
blob: fdcb06d886a08ea6e5f03d140c857f4e30dac02e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/*
 * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#pragma once

#include <LibPDF/Parser.h>

namespace PDF {

class DocumentParser final : public RefCounted<DocumentParser>
    , public Parser {
public:
    DocumentParser(Document*, ReadonlyBytes);

    enum class LinearizationResult {
        NotLinearized,
        Linearized,
    };

    [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; }

    // Parses the header and initializes the xref table and trailer
    PDFErrorOr<void> initialize();

    PDFErrorOr<Value> parse_object_with_index(u32 index);

    // Specialized version of parse_dict which aborts early if the dict being parsed
    // is not a page object
    PDFErrorOr<RefPtr<DictObject>> conditionally_parse_page_tree_node(u32 object_index);

private:
    struct LinearizationDictionary {
        u32 length_of_file { 0 };
        u32 primary_hint_stream_offset { 0 };
        u32 primary_hint_stream_length { 0 };
        u32 overflow_hint_stream_offset { 0 };
        u32 overflow_hint_stream_length { 0 };
        u32 first_page_object_number { 0 };
        u32 offset_of_first_page_end { 0 };
        u16 number_of_pages { 0 };
        u32 offset_of_main_xref_table { 0 };
        u32 first_page { 0 }; // The page to initially open (I think, the spec isn't all that clear here)
    };

    struct PageOffsetHintTable {
        u32 least_number_of_objects_in_a_page { 0 };
        u32 location_of_first_page_object { 0 };
        u16 bits_required_for_object_number { 0 };
        u32 least_length_of_a_page { 0 };
        u16 bits_required_for_page_length { 0 };
        u32 least_offset_of_any_content_stream { 0 };
        u16 bits_required_for_content_stream_offsets { 0 };
        u32 least_content_stream_length { 0 };
        u16 bits_required_for_content_stream_length { 0 };
        u16 bits_required_for_number_of_shared_obj_refs { 0 };
        u16 bits_required_for_greatest_shared_obj_identifier { 0 };
        u16 bits_required_for_fraction_numerator { 0 };
        u16 shared_object_reference_fraction_denominator { 0 };
    };

    struct PageOffsetHintTableEntry {
        u32 objects_in_page_number { 0 };
        u32 page_length_number { 0 };
        u32 number_of_shared_objects { 0 };
        Vector<u32> shared_object_identifiers {};
        Vector<u32> shared_object_location_numerators {};
        u32 page_content_stream_offset_number { 0 };
        u32 page_content_stream_length_number { 0 };
    };

    friend struct AK::Formatter<LinearizationDictionary>;
    friend struct AK::Formatter<PageOffsetHintTable>;
    friend struct AK::Formatter<PageOffsetHintTableEntry>;

    PDFErrorOr<void> parse_header();
    PDFErrorOr<LinearizationResult> initialize_linearization_dict();
    PDFErrorOr<void> initialize_linearized_xref_table();
    PDFErrorOr<void> initialize_non_linearized_xref_table();
    PDFErrorOr<void> validate_xref_table_and_fix_if_necessary();
    PDFErrorOr<void> initialize_hint_tables();
    PDFErrorOr<PageOffsetHintTable> parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes);
    Vector<PageOffsetHintTableEntry> parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes hint_stream_bytes);
    PDFErrorOr<NonnullRefPtr<XRefTable>> parse_xref_stream();
    PDFErrorOr<NonnullRefPtr<XRefTable>> parse_xref_table();
    PDFErrorOr<NonnullRefPtr<DictObject>> parse_file_trailer();
    PDFErrorOr<Value> parse_compressed_object_with_index(u32 index);

    bool navigate_to_before_eof_marker();
    bool navigate_to_after_startxref();

    RefPtr<XRefTable> m_xref_table;
    RefPtr<DictObject> m_trailer;
    Optional<LinearizationDictionary> m_linearization_dictionary;
};

}