summaryrefslogtreecommitdiff
path: root/Userland/Libraries/LibPDF/Parser.h
blob: 216c9e6da82b48e517a3afa8799cee3ae32fd841 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/*
 * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#pragma once

#include <AK/NonnullRefPtrVector.h>
#include <AK/StdLibExtras.h>
#include <LibPDF/Command.h>
#include <LibPDF/Object.h>
#include <LibPDF/Reader.h>
#include <LibPDF/XRefTable.h>

namespace PDF {

class Document;

class Parser {
public:
    static Vector<Command> parse_graphics_commands(const ReadonlyBytes&);

    Parser(Badge<Document>, const ReadonlyBytes&);

    void set_document(RefPtr<Document> document) { m_document = document; }

    bool perform_validation();

    struct XRefTableAndTrailer {
        XRefTable xref_table;
        NonnullRefPtr<DictObject> trailer;
    };
    XRefTableAndTrailer parse_last_xref_table_and_trailer();

    NonnullRefPtr<IndirectValue> parse_indirect_value_at_offset(size_t offset);

    RefPtr<DictObject> conditionally_parse_page_tree_node_at_offset(size_t offset);

private:
    explicit Parser(const ReadonlyBytes&);

    bool parse_header();
    XRefTable parse_xref_table();
    NonnullRefPtr<DictObject> parse_file_trailer();

    bool navigate_to_before_eof_marker();
    bool navigate_to_after_startxref();

    // If the PDF is linearized, the first object will be the linearization
    // parameter dictionary, and it will always occur within the first 1024 bytes.
    // We do a very sloppy and context-free search for this object. A return value
    // of true does not necessarily mean this PDF is linearized, but a return value
    // of false does mean this PDF is not linearized.
    // FIXME: false doesn't guarantee non-linearization, but we VERIFY the result!
    bool sloppy_is_linearized();

    String parse_comment();

    Value parse_value();
    Value parse_possible_indirect_value_or_ref();
    NonnullRefPtr<IndirectValue> parse_indirect_value(int index, int generation);
    NonnullRefPtr<IndirectValue> parse_indirect_value();
    Value parse_number();
    NonnullRefPtr<NameObject> parse_name();
    NonnullRefPtr<StringObject> parse_string();
    String parse_literal_string();
    String parse_hex_string();
    NonnullRefPtr<ArrayObject> parse_array();
    NonnullRefPtr<DictObject> parse_dict();
    NonnullRefPtr<StreamObject> parse_stream(NonnullRefPtr<DictObject> dict);

    Vector<Command> parse_graphics_commands();

    bool matches_eol() const;
    bool matches_whitespace() const;
    bool matches_number() const;
    bool matches_delimiter() const;
    bool matches_regular_character() const;

    void consume_eol();
    bool consume_whitespace();
    char consume();
    void consume(char);

    Reader m_reader;
    RefPtr<Document> m_document;
};

}