summaryrefslogtreecommitdiff
path: root/Userland/Libraries/LibPDF/Parser.h
blob: 2dba1a0515b2a384c0217088e91b6069b7c21055 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
/*
 * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#pragma once

#include <AK/NonnullRefPtrVector.h>
#include <LibPDF/Command.h>
#include <LibPDF/Object.h>
#include <LibPDF/Reader.h>
#include <LibPDF/XRefTable.h>

namespace PDF {

class Document;

class Parser final : public RefCounted<Parser> {
public:
    enum class LinearizationResult {
        Error,
        NotLinearized,
        Linearized,
    };

    static Vector<Command> parse_graphics_commands(ReadonlyBytes const&);

    Parser(Badge<Document>, ReadonlyBytes const&);

    [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; }
    void set_document(RefPtr<Document> const& document) { m_document = document; }

    // Parses the header and initializes the xref table and trailer
    bool initialize();

    Value parse_object_with_index(u32 index);

    // Specialized version of parse_dict which aborts early if the dict being parsed
    // is not a page object. A null RefPtr return indicates that the dict at this index
    // is not a page tree node, whereas ok == false indicates a malformed PDF file and
    // should cause an abort of the current operation.
    RefPtr<DictObject> conditionally_parse_page_tree_node(u32 object_index, bool& ok);

private:
    struct LinearizationDictionary {
        u32 length_of_file { 0 };
        u32 primary_hint_stream_offset { 0 };
        u32 primary_hint_stream_length { 0 };
        u32 overflow_hint_stream_offset { 0 };
        u32 overflow_hint_stream_length { 0 };
        u32 first_page_object_number { 0 };
        u32 offset_of_first_page_end { 0 };
        u16 number_of_pages { 0 };
        u32 offset_of_main_xref_table { 0 };
        u32 first_page { 0 }; // The page to initially open (I think, the spec isn't all that clear here)
    };

    struct PageOffsetHintTable {
        u32 least_number_of_objects_in_a_page { 0 };
        u32 location_of_first_page_object { 0 };
        u16 bits_required_for_object_number { 0 };
        u32 least_length_of_a_page { 0 };
        u16 bits_required_for_page_length { 0 };
        u32 least_offset_of_any_content_stream { 0 };
        u16 bits_required_for_content_stream_offsets { 0 };
        u32 least_content_stream_length { 0 };
        u16 bits_required_for_content_stream_length { 0 };
        u16 bits_required_for_number_of_shared_obj_refs { 0 };
        u16 bits_required_for_greatest_shared_obj_identifier { 0 };
        u16 bits_required_for_fraction_numerator { 0 };
        u16 shared_object_reference_fraction_denominator { 0 };
    };

    struct PageOffsetHintTableEntry {
        u32 objects_in_page_number { 0 };
        u32 page_length_number { 0 };
        u32 number_of_shared_objects { 0 };
        Vector<u32> shared_object_identifiers {};
        Vector<u32> shared_object_location_numerators {};
        u32 page_content_stream_offset_number { 0 };
        u32 page_content_stream_length_number { 0 };
    };

    friend struct AK::Formatter<LinearizationDictionary>;
    friend struct AK::Formatter<PageOffsetHintTable>;
    friend struct AK::Formatter<PageOffsetHintTableEntry>;

    explicit Parser(ReadonlyBytes const&);

    bool parse_header();
    LinearizationResult initialize_linearization_dict();
    bool initialize_linearized_xref_table();
    bool initialize_non_linearized_xref_table();
    bool initialize_hint_tables();
    Optional<PageOffsetHintTable> parse_page_offset_hint_table(ReadonlyBytes const& hint_stream_bytes);
    Optional<Vector<PageOffsetHintTableEntry>> parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes const& hint_stream_bytes);
    RefPtr<XRefTable> parse_xref_table();
    RefPtr<DictObject> parse_file_trailer();

    bool navigate_to_before_eof_marker();
    bool navigate_to_after_startxref();

    // If the PDF is linearized, the first object will be the linearization
    // parameter dictionary, and it will always occur within the first 1024 bytes.
    // We do a very sloppy and context-free search for this object. A return value
    // of true does not necessarily mean this PDF is linearized, but a return value
    // of false does mean this PDF is not linearized.
    // FIXME: false doesn't guarantee non-linearization, but we VERIFY the result!
    bool sloppy_is_linearized();

    String parse_comment();

    Value parse_value();
    Value parse_possible_indirect_value_or_ref();
    RefPtr<IndirectValue> parse_indirect_value(int index, int generation);
    RefPtr<IndirectValue> parse_indirect_value();
    Value parse_number();
    RefPtr<NameObject> parse_name();
    RefPtr<StringObject> parse_string();
    String parse_literal_string();
    String parse_hex_string();
    RefPtr<ArrayObject> parse_array();
    RefPtr<DictObject> parse_dict();
    RefPtr<StreamObject> parse_stream(NonnullRefPtr<DictObject> dict);

    Vector<Command> parse_graphics_commands();

    bool matches_eol() const;
    bool matches_whitespace() const;
    bool matches_number() const;
    bool matches_delimiter() const;
    bool matches_regular_character() const;

    bool consume_eol();
    bool consume_whitespace();
    char consume();
    void consume(int amount);
    bool consume(char);

    Reader m_reader;
    RefPtr<Document> m_document;
    RefPtr<XRefTable> m_xref_table;
    RefPtr<DictObject> m_trailer;
    Optional<LinearizationDictionary> m_linearization_dictionary;
};

};